Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3970,6 +3970,16 @@
   /// power-of-2 denominators. If the target returns an empty SDValue, LLVM
   /// assumes SDIV is expensive and replaces it with a series of other integer
   /// operations.
+
+  // FIXME(review): these added lines sit between the doc comment above and
+  // the BuildSDIVPow2 declaration it documents; move this declaration so the
+  // two stay adjacent.
+  /// Expand an ISD::UREM by a constant divisor into a multiply-based sequence.
+  /// Returns an empty SDValue when the expansion is not performed; nodes built
+  /// along the way are appended to \p Created.
+  SDValue BuildUREM(SDNode *Node, SelectionDAG &DAG, bool IsAfterLegalization,
+                    SmallVectorImpl<SDNode *> &Created) const;
+
   virtual SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                 SelectionDAG &DAG,
                                 SmallVectorImpl<SDNode *> &Created) const;
Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3916,6 +3916,26 @@
 
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
 
+  if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr) &&
+      isConstantOrConstantVector(N1)) {
+    // Expand the remainder directly only when no division of the same
+    // operands exists; otherwise leave it to be combined with that division.
+    unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
+    SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(), {N0, N1});
+    if (!DivNode) {
+      SmallVector<SDNode *, 8> Built;
+      SDValue OptimizedRem =
+          isSigned ? SDValue() /* placeholder for srem */
+                   : TLI.BuildUREM(N, DAG, LegalOperations, Built);
+      if (OptimizedRem.getNode()) {
+        // BuiltNode, not N: avoid shadowing the REM node being combined.
+        for (SDNode *BuiltNode : Built)
+          AddToWorklist(BuiltNode);
+        return OptimizedRem;
+      }
+    }
+  }
+
   // If X/C can be simplified by the division-by-constant logic, lower
   // X%C to the equivalent of X-X/C*C.
   // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4893,6 +4893,122 @@
   std::replace_if(Values.begin(), Values.end(), Predicate, Replacement);
 }
 
+/// Given an ISD::UREM where the divisor is constant,
+/// return a DAG expression that will generate the same result
+/// using multiplications instead of a division:
+///   c       = ceil(2^F / d)
+///   lowbits = (c * n) mod 2^F
+///   n % d   = (lowbits * d) >> F
+/// where F is twice the scalar bit width of the operation.
+/// Ref: D. Lemire, O. Kaser, and N. Kurz, "Faster Remainder by Direct
+/// Computation" (LKK)
+///
+/// Returns an empty SDValue when the expansion is not performed. On success
+/// the nodes built along the way are appended to \p Created.
+SDValue TargetLowering::BuildUREM(SDNode *Node, SelectionDAG &DAG,
+                                  bool IsAfterLegalization,
+                                  SmallVectorImpl<SDNode *> &Created) const {
+  // when optimising for minimum size, we don't want to expand div
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return SDValue();
+
+  SDLoc DL(Node);
+  EVT VT = Node->getValueType(0);
+
+  // FVT is VT with every scalar widened to twice its size.
+  EVT FVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
+  if (VT.isVector())
+    FVT = EVT::getVectorVT(*DAG.getContext(), FVT, VT.getVectorElementCount());
+  unsigned F = FVT.getScalarSizeInBits();
+
+  // Check to see if we can do this.
+  if (!isTypeLegal(VT) || !isTypeLegal(FVT))
+    return SDValue();
+
+  // If MUL is unavailable, we cannot proceed in any case.
+  if (!isOperationLegalOrCustom(ISD::MUL, FVT))
+    return SDValue();
+
+  // The high half of lowbits * d comes from MULHU or, failing that, from
+  // UMUL_LOHI. Settle this before any node is built.
+  auto IsUsable = [&](unsigned Opcode) {
+    return IsAfterLegalization ? isOperationLegal(Opcode, FVT)
+                               : isOperationLegalOrCustom(Opcode, FVT);
+  };
+  const bool UseMULHU = IsUsable(ISD::MULHU);
+  if (!UseMULHU && !IsUsable(ISD::UMUL_LOHI))
+    return SDValue(); // No mulhu or equivalent
+
+  SmallVector<SDValue, 16> MagicFactors;
+  bool AllDivisorsArePowerOfTwo = true;
+  bool AllDivisorsAreOnes = true;
+
+  auto BuildUREMPattern = [&](ConstantSDNode *DivisorConstant) {
+    const APInt &D = DivisorConstant->getAPIntValue();
+
+    // Validate before dividing: a zero divisor must never reach udiv below.
+    // isStrictlyPositive is a signed test, so besides zero this also rejects
+    // N-bit divisors with the top bit set; accepted range is [1, 2^(N-1)).
+    if (!D.isStrictlyPositive())
+      return false;
+
+    // Magic number: c = ceil(2^F / d) = floor((2^F - 1) / d) + 1, computed
+    // modulo 2^F (so it wraps to 0 for d == 1, which still yields n % 1 == 0).
+    APInt C = APInt::getMaxValue(F).udiv(D.zext(F)) + APInt(F, 1);
+    MagicFactors.push_back(DAG.getConstant(C, DL, FVT.getScalarType()));
+
+    AllDivisorsArePowerOfTwo &= D.isPowerOf2();
+    AllDivisorsAreOnes &= D.isOneValue();
+    return true;
+  };
+
+  // numerator
+  SDValue Numerator = Node->getOperand(0);
+  SDValue ExtendedNumerator = DAG.getZExtOrTrunc(Numerator, DL, FVT);
+
+  // divisor constant
+  SDValue Divisor = Node->getOperand(1);
+  SDValue ExtendedDivisor = DAG.getZExtOrTrunc(Divisor, DL, FVT);
+
+  if (!ISD::matchUnaryPredicate(Divisor, BuildUREMPattern))
+    return SDValue();
+
+  // If this is a urem by one, avoid the fold since it can be constant-folded.
+  if (AllDivisorsAreOnes)
+    return SDValue();
+
+  // If this is a urem by powers of two, avoid the fold since it can be
+  // best implemented as a bit mask (AND).
+  if (AllDivisorsArePowerOfTwo)
+    return SDValue();
+
+  SDValue MagicFactor = VT.isVector()
+                            ? DAG.getBuildVector(FVT, DL, MagicFactors)
+                            : MagicFactors[0];
+
+  // lowbits = c * n
+  SDValue Lowbits =
+      DAG.getNode(ISD::MUL, DL, FVT, MagicFactor, ExtendedNumerator);
+
+  // result = lowbits * d >> F
+  SDValue Result;
+  if (UseMULHU) {
+    Result = DAG.getNode(ISD::MULHU, DL, FVT, Lowbits, ExtendedDivisor);
+  } else {
+    SDValue LoHi = DAG.getNode(ISD::UMUL_LOHI, DL, DAG.getVTList(FVT, FVT),
+                               Lowbits, ExtendedDivisor);
+    Result = SDValue(LoHi.getNode(), 1);
+  }
+
+  Created.push_back(MagicFactor.getNode());
+  Created.push_back(ExtendedNumerator.getNode());
+  Created.push_back(Lowbits.getNode());
+  Created.push_back(ExtendedDivisor.getNode());
+  Created.push_back(Result.getNode());
+
+  return DAG.getZExtOrTrunc(Result, DL, VT);
+}
+
 /// Given an ISD::UREM used only by an ISD::SETEQ or ISD::SETNE
 /// where the divisor is constant and the comparison target is zero,
 /// return a DAG expression that will generate the same comparison result
Index: llvm/test/CodeGen/AArch64/urem-lkk.ll
===================================================================
--- llvm/test/CodeGen/AArch64/urem-lkk.ll
+++ llvm/test/CodeGen/AArch64/urem-lkk.ll
@@ -4,15 +4,15 @@
 define i32 @fold_urem_positive_odd(i32 %x) {
 ; CHECK-LABEL: fold_urem_positive_odd:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8969
-; CHECK-NEXT: movk w8, #22765, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #32
-; CHECK-NEXT: sub w9, w0, w8
-; CHECK-NEXT: add w8, w8, w9, lsr #1
-; CHECK-NEXT: lsr w8, w8, #6
+; CHECK-NEXT: mov x9, #7589
+; CHECK-NEXT: movk x9, #4139, lsl #16
+; CHECK-NEXT: movk x9, #55878, lsl #32
+; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: movk x9, #689, lsl #48
+; CHECK-NEXT: mul x8, x8, x9
 ; CHECK-NEXT: mov w9, #95
-; CHECK-NEXT: msub w0, w8, w9, w0
+; CHECK-NEXT: umulh x0, x8, x9
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
 ; CHECK-NEXT: ret
 %1 = urem i32 %x, 95
 ret i32 %1
@@ -22,12 +22,15 @@
 define i32 @fold_urem_positive_even(i32 %x) {
 ; CHECK-LABEL: fold_urem_positive_even:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #16323
-; CHECK-NEXT: movk w8, #63310, lsl #16
-; CHECK-NEXT: umull x8, w0, w8
-; CHECK-NEXT: lsr x8, x8, #42
+; CHECK-NEXT: mov
x9, #7172 +; CHECK-NEXT: movk x9, #61579, lsl #16 +; CHECK-NEXT: movk x9, #54159, lsl #32 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: movk x9, #61, lsl #48 +; CHECK-NEXT: mul x8, x8, x9 ; CHECK-NEXT: mov w9, #1060 -; CHECK-NEXT: msub w0, w8, w9, w0 +; CHECK-NEXT: umulh x0, x8, x9 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 ; CHECK-NEXT: ret %1 = urem i32 %x, 1060 ret i32 %1 Index: llvm/test/CodeGen/AArch64/urem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/AArch64/urem-vector-lkk.ll +++ llvm/test/CodeGen/AArch64/urem-vector-lkk.ll @@ -4,44 +4,16 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; CHECK-LABEL: fold_urem_vec_1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w11, #33437 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: movk w11, #21399, lsl #16 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: mov w9, #16913 -; CHECK-NEXT: mov w12, #98 -; CHECK-NEXT: lsr x11, x11, #37 -; CHECK-NEXT: movk w9, #8456, lsl #16 -; CHECK-NEXT: msub w10, w11, w12, w10 -; CHECK-NEXT: ubfx w12, w8, #2, #14 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: mov w11, #124 -; CHECK-NEXT: lsr x9, x9, #34 -; CHECK-NEXT: msub w8, w9, w11, w8 -; CHECK-NEXT: mov w9, #8969 -; CHECK-NEXT: umov w12, v0.h[0] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: sub w11, w12, w9 -; CHECK-NEXT: add w9, w9, w11, lsr #1 -; CHECK-NEXT: mov w11, #95 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: msub w9, w9, w11, w12 -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: mov w9, #2287 -; CHECK-NEXT: movk w9, #16727, lsl #16 -; CHECK-NEXT: umull x9, w11, w9 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov w8, #1003 -; CHECK-NEXT: lsr x9, x9, #40 -; CHECK-NEXT: mov v0.h[2], w10 -; CHECK-NEXT: msub w8, w9, w8, w11 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 
killed $d0 killed $q0 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: adrp x9, .LCPI0_1 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI0_1] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v1.2d, v0.4s, v2.4s +; CHECK-NEXT: umull v0.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -50,43 +22,16 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; CHECK-LABEL: fold_urem_vec_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #8969 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umov w10, v0.h[0] -; CHECK-NEXT: umull x13, w8, w9 -; CHECK-NEXT: umov w11, v0.h[2] -; CHECK-NEXT: umull x14, w10, w9 -; CHECK-NEXT: lsr x13, x13, #32 -; CHECK-NEXT: umov w12, v0.h[3] -; CHECK-NEXT: umull x15, w11, w9 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: sub w16, w8, w13 -; CHECK-NEXT: umull x9, w12, w9 -; CHECK-NEXT: lsr x15, x15, #32 -; CHECK-NEXT: add w13, w13, w16, lsr #1 -; CHECK-NEXT: sub w16, w10, w14 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w14, w14, w16, lsr #1 -; CHECK-NEXT: sub w16, w11, w15 -; CHECK-NEXT: add w15, w15, w16, lsr #1 -; CHECK-NEXT: sub w16, w12, w9 -; CHECK-NEXT: add w9, w9, w16, lsr #1 -; CHECK-NEXT: mov w16, #95 -; CHECK-NEXT: lsr w13, w13, #6 -; CHECK-NEXT: msub w8, w13, w16, w8 -; CHECK-NEXT: lsr w13, w14, #6 -; CHECK-NEXT: msub w10, w13, w16, w10 -; CHECK-NEXT: lsr w13, w15, #6 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: msub w11, w13, w16, w11 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: mov v0.h[1], w8 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: msub w8, w9, w16, w12 -; CHECK-NEXT: mov v0.h[3], w8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: mov w8, #55879 +; CHECK-NEXT: movk w8, #689, lsl #16 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: 
dup v2.4s, w8 +; CHECK-NEXT: movi v1.4s, #95 +; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -150,28 +95,16 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_urem_power_of_two: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #8969 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: movk w9, #22765, lsl #16 -; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: sub w10, w8, w9 -; CHECK-NEXT: add w9, w9, w10, lsr #1 -; CHECK-NEXT: mov w10, #95 -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: and w9, w9, #0x3f -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: and w10, w10, #0x1f -; CHECK-NEXT: and w9, w9, #0x7 -; CHECK-NEXT: mov v1.h[1], w10 -; CHECK-NEXT: mov v1.h[2], w9 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: adrp x9, .LCPI3_1 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_1] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v1.2d, v0.4s, v2.4s +; CHECK-NEXT: umull v0.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -181,34 +114,16 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { ; CHECK-LABEL: dont_fold_srem_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #17097 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: movk w9, #45590, lsl #16 -; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: mov w10, #23 -; CHECK-NEXT: lsr x9, x9, #36 -; 
CHECK-NEXT: umov w11, v0.h[1] -; CHECK-NEXT: msub w8, w9, w10, w8 -; CHECK-NEXT: mov w9, #30865 -; CHECK-NEXT: movk w9, #51306, lsl #16 -; CHECK-NEXT: ubfx w10, w11, #1, #15 -; CHECK-NEXT: umull x9, w10, w9 -; CHECK-NEXT: mov w10, #654 -; CHECK-NEXT: lsr x9, x9, #40 -; CHECK-NEXT: msub w9, w9, w10, w11 -; CHECK-NEXT: mov w11, #47143 -; CHECK-NEXT: umov w10, v0.h[3] -; CHECK-NEXT: movk w11, #24749, lsl #16 -; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: mov v1.h[1], w9 -; CHECK-NEXT: mov w9, #5423 -; CHECK-NEXT: lsr x11, x11, #43 -; CHECK-NEXT: mov v1.h[2], w8 -; CHECK-NEXT: msub w8, w11, w9, w10 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: adrp x9, .LCPI4_1 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI4_1] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umull2 v1.2d, v0.4s, v2.4s +; CHECK-NEXT: umull v0.2d, v0.2s, v2.2s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, ret <4 x i16> %1 Index: llvm/test/CodeGen/ARM/urem-opt-size.ll =================================================================== --- llvm/test/CodeGen/ARM/urem-opt-size.ll +++ llvm/test/CodeGen/ARM/urem-opt-size.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; When optimising for minimum size, we don't want to expand a div to a mul ; and a shift sequence. As a result, the urem instruction e.g. 
will not be ; expanded to a sequence of umull, lsrs, muls and sub instructions, but @@ -14,20 +15,58 @@ target triple = "thumbv7m-arm-none-eabi" define i32 @foo1() local_unnamed_addr #0 { -entry: ; CHECK-LABEL: foo1: -; CHECK:__aeabi_idiv -; CHECK-NOT: smmul +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl GetValue +; CHECK-NEXT: movw r1, #16960 +; CHECK-NEXT: movt r1, #15 +; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: b __aeabi_idiv +; +; V7M-LABEL: foo1: +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: bl GetValue +; V7M-NEXT: ldr r1, .LCPI0_0 +; V7M-NEXT: sdiv r0, r0, r1 +; V7M-NEXT: pop {r7, pc} +; V7M-NEXT: .p2align 2 +; V7M-NEXT: @ %bb.1: +; V7M-NEXT: .LCPI0_0: +; V7M-NEXT: .long 1000000 @ 0xf4240 +entry: %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)() %div = sdiv i32 %call, 1000000 ret i32 %div } define i32 @foo2() local_unnamed_addr #0 { -entry: ; CHECK-LABEL: foo2: -; CHECK: __aeabi_uidiv -; CHECK-NOT: umull +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl GetValue +; CHECK-NEXT: movw r1, #16960 +; CHECK-NEXT: movt r1, #15 +; CHECK-NEXT: pop {r11, lr} +; CHECK-NEXT: b __aeabi_uidiv +; +; V7M-LABEL: foo2: +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: bl GetValue +; V7M-NEXT: ldr r1, .LCPI1_0 +; V7M-NEXT: udiv r0, r0, r1 +; V7M-NEXT: pop {r7, pc} +; V7M-NEXT: .p2align 2 +; V7M-NEXT: @ %bb.1: +; V7M-NEXT: .LCPI1_0: +; V7M-NEXT: .long 1000000 @ 0xf4240 +entry: %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)() %div = udiv i32 %call, 1000000 ret i32 %div @@ -35,14 +74,34 @@ ; Test for unsigned remainder define i32 @foo3() local_unnamed_addr #0 { -entry: ; CHECK-LABEL: foo3: -; CHECK: __aeabi_uidivmod -; CHECK-NOT: umull +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl 
GetValue +; CHECK-NEXT: movw r1, #16960 +; CHECK-NEXT: movt r1, #15 +; CHECK-NEXT: bl __aeabi_uidivmod +; CHECK-NEXT: clz r0, r1 +; CHECK-NEXT: lsr r0, r0, #5 +; CHECK-NEXT: pop {r11, pc} +; ; V7M-LABEL: foo3: -; V7M: udiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]] -; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]] -; V7M-NOT: __aeabi_uidivmod +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: bl GetValue +; V7M-NEXT: ldr r1, .LCPI2_0 +; V7M-NEXT: udiv r2, r0, r1 +; V7M-NEXT: mls r0, r2, r1, r0 +; V7M-NEXT: clz r0, r0 +; V7M-NEXT: lsrs r0, r0, #5 +; V7M-NEXT: pop {r7, pc} +; V7M-NEXT: .p2align 2 +; V7M-NEXT: @ %bb.1: +; V7M-NEXT: .LCPI2_0: +; V7M-NEXT: .long 1000000 @ 0xf4240 +entry: %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)() %rem = urem i32 %call, 1000000 %cmp = icmp eq i32 %rem, 0 @@ -52,13 +111,31 @@ ; Test for signed remainder define i32 @foo4() local_unnamed_addr #0 { -entry: ; CHECK-LABEL: foo4: -; CHECK:__aeabi_idivmod +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl GetValue +; CHECK-NEXT: movw r1, #16960 +; CHECK-NEXT: movt r1, #15 +; CHECK-NEXT: bl __aeabi_idivmod +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: pop {r11, pc} +; ; V7M-LABEL: foo4: -; V7M: sdiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]] -; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]] -; V7M-NOT: __aeabi_idivmod +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: bl GetValue +; V7M-NEXT: ldr r1, .LCPI3_0 +; V7M-NEXT: sdiv r2, r0, r1 +; V7M-NEXT: mls r0, r2, r1, r0 +; V7M-NEXT: pop {r7, pc} +; V7M-NEXT: .p2align 2 +; V7M-NEXT: @ %bb.1: +; V7M-NEXT: .LCPI3_0: +; V7M-NEXT: .long 1000000 @ 0xf4240 +entry: %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)() %rem = srem i32 %call, 1000000 ret i32 %rem @@ -68,14 +145,32 @@ ; as the division needs to be computed anyway in order to calculate ; the remainder (i.e. 
make sure we don't end up with two divisions). define i32 @foo5() local_unnamed_addr #0 { -entry: ; CHECK-LABEL: foo5: -; CHECK:__aeabi_idivmod +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl GetValue +; CHECK-NEXT: movw r1, #16960 +; CHECK-NEXT: movt r1, #15 +; CHECK-NEXT: bl __aeabi_idivmod +; CHECK-NEXT: add r0, r0, r1 +; CHECK-NEXT: pop {r11, pc} +; ; V7M-LABEL: foo5: -; V7M: sdiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]] -; V7M-NOT: sdiv -; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]] -; V7M-NOT: __aeabi_idivmod +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: bl GetValue +; V7M-NEXT: ldr r1, .LCPI4_0 +; V7M-NEXT: sdiv r2, r0, r1 +; V7M-NEXT: mls r0, r2, r1, r0 +; V7M-NEXT: add r0, r2 +; V7M-NEXT: pop {r7, pc} +; V7M-NEXT: .p2align 2 +; V7M-NEXT: @ %bb.1: +; V7M-NEXT: .LCPI4_0: +; V7M-NEXT: .long 1000000 @ 0xf4240 +entry: %call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)() %div = sdiv i32 %call, 1000000 %rem = srem i32 %call, 1000000 @@ -89,9 +184,36 @@ ; legalization and this optimisation. 
; Function Attrs: norecurse nounwind define i64 @isel_dont_hang(i32 %bar) local_unnamed_addr #4 { -entry: ; CHECK-LABEL: isel_dont_hang: -; CHECK: __aeabi_uldivmod +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: .setfp r11, sp +; CHECK-NEXT: mov r11, sp +; CHECK-NEXT: asr r1, r0, #31 +; CHECK-NEXT: adds r2, r0, #2 +; CHECK-NEXT: adc r3, r1, #0 +; CHECK-NEXT: lsl r1, r1, #1 +; CHECK-NEXT: orr r1, r1, r0, lsr #31 +; CHECK-NEXT: lsl r0, r0, #1 +; CHECK-NEXT: bl __aeabi_uldivmod +; CHECK-NEXT: pop {r11, pc} +; +; V7M-LABEL: isel_dont_hang: +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: .save {r7, lr} +; V7M-NEXT: push {r7, lr} +; V7M-NEXT: .setfp r7, sp +; V7M-NEXT: mov r7, sp +; V7M-NEXT: asrs r1, r0, #31 +; V7M-NEXT: adds r2, r0, #2 +; V7M-NEXT: adc r3, r1, #0 +; V7M-NEXT: lsl.w r1, r1, #1 +; V7M-NEXT: orr.w r1, r1, r0, lsr #31 +; V7M-NEXT: lsl.w r0, r0, #1 +; V7M-NEXT: bl __aeabi_uldivmod +; V7M-NEXT: pop {r7, pc} +entry: %temp.0 = sext i32 %bar to i64 %mul83 = shl i64 %temp.0, 1 %add84 = add i64 %temp.0, 2 @@ -101,10 +223,24 @@ ; i16 types are promoted to i32, and we expect a normal udiv here: define i16 @isel_dont_hang_2(i16 %bar) local_unnamed_addr #4 { -entry: ; CHECK-LABEL: isel_dont_hang_2: -; CHECK: udiv -; CHECK-NOT: __aeabi_ +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add r1, r0, #2 +; CHECK-NEXT: lsl r0, r0, #1 +; CHECK-NEXT: uxth r1, r1 +; CHECK-NEXT: uxth r0, r0 +; CHECK-NEXT: udiv r0, r0, r1 +; CHECK-NEXT: bx lr +; +; V7M-LABEL: isel_dont_hang_2: +; V7M: @ %bb.0: @ %entry +; V7M-NEXT: adds r1, r0, #2 +; V7M-NEXT: lsl.w r0, r0, #1 +; V7M-NEXT: uxth r1, r1 +; V7M-NEXT: uxth r0, r0 +; V7M-NEXT: udiv r0, r0, r1 +; V7M-NEXT: bx lr +entry: %mul83 = shl i16 %bar, 1 %add84 = add i16 %bar, 2 %div85 = udiv i16 %mul83, %add84 Index: llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ 
llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -11,113 +11,116 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; P9LE-LABEL: fold_urem_vec_1: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: lis r4, 689 +; P9LE-NEXT: ori r4, r4, 55878 +; P9LE-NEXT: li r3, 0 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: lis r5, 21399 -; P9LE-NEXT: ori r5, r5, 33437 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: lis r5, 16727 -; P9LE-NEXT: ori r5, r5, 2287 -; P9LE-NEXT: rldicl r4, r4, 27, 37 -; P9LE-NEXT: mulli r4, r4, 98 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: oris r4, r4, 4139 +; P9LE-NEXT: ori r4, r4, 7589 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 95 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: lis r4, 528 +; P9LE-NEXT: ori r4, r4, 33825 +; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: oris r4, r4, 2114 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: lis r5, 8456 -; P9LE-NEXT: ori r5, r5, 16913 -; P9LE-NEXT: rldicl r4, r4, 24, 40 -; P9LE-NEXT: mulli r4, r4, 1003 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: ori r4, r4, 4229 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 124 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: lis r4, 668 +; P9LE-NEXT: ori r4, r4, 48148 ; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: oris r4, r4, 58848 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 30, 18, 31 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: rldicl r4, r4, 30, 34 -; P9LE-NEXT: mulli r4, r4, 124 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: ori r4, r4, 42800 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 98 +; P9LE-NEXT: mulhdu r3, r3, 
r4 +; P9LE-NEXT: lis r4, 65 +; P9LE-NEXT: ori r4, r4, 22280 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 61158 +; P9LE-NEXT: ori r4, r4, 14506 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: lis r6, 22765 -; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 1003 +; P9LE-NEXT: mulhdu r3, r3, r4 ; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: xxswapd v4, vs0 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 -; P9LE-NEXT: vmrglh v2, v4, v2 -; P9LE-NEXT: vmrglw v2, v3, v2 +; P9LE-NEXT: vmrglh v2, v2, v4 +; P9LE-NEXT: vmrglw v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: fold_urem_vec_1: ; P9BE: # %bb.0: +; P9BE-NEXT: lis r4, 65 +; P9BE-NEXT: ori r4, r4, 22280 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: lis r5, 16727 -; P9BE-NEXT: ori r5, r5, 2287 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: lis r5, 21399 -; P9BE-NEXT: ori r5, r5, 33437 -; P9BE-NEXT: rldicl r4, r4, 24, 40 -; P9BE-NEXT: mulli r4, r4, 1003 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: oris r4, r4, 61158 +; P9BE-NEXT: ori r4, r4, 14506 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 1003 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 668 +; P9BE-NEXT: ori r4, r4, 48148 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: oris r4, r4, 58848 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: ori r4, r4, 42800 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: 
rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: lis r5, 8456 -; P9BE-NEXT: ori r5, r5, 16913 -; P9BE-NEXT: rldicl r4, r4, 27, 37 -; P9BE-NEXT: mulli r4, r4, 98 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 98 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 528 +; P9BE-NEXT: ori r4, r4, 33825 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: oris r4, r4, 2114 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: ori r4, r4, 4229 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: rlwinm r3, r3, 30, 18, 31 -; P9BE-NEXT: mulld r3, r3, r5 -; P9BE-NEXT: lis r5, 22765 -; P9BE-NEXT: ori r5, r5, 8969 -; P9BE-NEXT: rldicl r3, r3, 30, 34 -; P9BE-NEXT: mulli r3, r3, 124 -; P9BE-NEXT: subf r3, r3, r4 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 124 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 689 +; P9BE-NEXT: ori r4, r4, 55878 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: vmrghh v3, v4, v3 +; P9BE-NEXT: oris r4, r4, 4139 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: ori r4, r4, 7589 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r5, r4, r3 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 95 +; P9BE-NEXT: mulhdu r3, r3, r4 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: vmrghh v2, v2, v4 @@ -127,110 +130,113 @@ ; P8LE-LABEL: fold_urem_vec_1: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: lis r8, 21399 -; P8LE-NEXT: ori r3, r3, 8969 -; P8LE-NEXT: ori r8, r8, 33437 -; P8LE-NEXT: 
mfvsrd r4, f0 -; P8LE-NEXT: clrldi r5, r4, 48 -; P8LE-NEXT: rldicl r9, r4, 32, 48 -; P8LE-NEXT: rlwinm r6, r5, 0, 16, 31 -; P8LE-NEXT: rldicl r10, r4, 16, 48 -; P8LE-NEXT: rlwinm r11, r9, 0, 16, 31 -; P8LE-NEXT: clrldi r7, r6, 32 -; P8LE-NEXT: rlwinm r12, r10, 0, 16, 31 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: lis r4, 528 +; P8LE-NEXT: lis r5, 668 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: ori r4, r4, 33825 +; P8LE-NEXT: ori r5, r5, 48148 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: mfvsrd r6, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: oris r4, r4, 2114 +; P8LE-NEXT: ori r3, r3, 7589 +; P8LE-NEXT: sldi r5, r5, 32 +; P8LE-NEXT: ori r4, r4, 4229 +; P8LE-NEXT: clrldi r7, r6, 48 +; P8LE-NEXT: rldicl r8, r6, 48, 48 +; P8LE-NEXT: oris r5, r5, 58848 +; P8LE-NEXT: rlwinm r7, r7, 0, 16, 31 +; P8LE-NEXT: rlwinm r8, r8, 0, 16, 31 +; P8LE-NEXT: ori r5, r5, 42800 ; P8LE-NEXT: mulld r3, r7, r3 -; P8LE-NEXT: lis r7, 16727 -; P8LE-NEXT: ori r7, r7, 2287 -; P8LE-NEXT: mulld r8, r11, r8 -; P8LE-NEXT: lis r11, 8456 -; P8LE-NEXT: rldicl r4, r4, 48, 48 -; P8LE-NEXT: mulld r7, r12, r7 -; P8LE-NEXT: ori r11, r11, 16913 -; P8LE-NEXT: rlwinm r12, r4, 30, 18, 31 -; P8LE-NEXT: rldicl r3, r3, 32, 32 -; P8LE-NEXT: mulld r11, r12, r11 -; P8LE-NEXT: subf r6, r3, r6 -; P8LE-NEXT: rldicl r8, r8, 27, 37 -; P8LE-NEXT: srwi r6, r6, 1 -; P8LE-NEXT: add r3, r6, r3 -; P8LE-NEXT: rldicl r6, r7, 24, 40 -; P8LE-NEXT: mulli r7, r8, 98 -; P8LE-NEXT: srwi r3, r3, 6 -; P8LE-NEXT: rldicl r8, r11, 30, 34 -; P8LE-NEXT: mulli r6, r6, 1003 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: mulli r8, r8, 124 -; P8LE-NEXT: subf r7, r7, r9 -; P8LE-NEXT: subf r6, r6, r10 -; P8LE-NEXT: mtvsrd f0, r7 -; P8LE-NEXT: subf r3, r3, r5 -; P8LE-NEXT: subf r4, r8, r4 -; P8LE-NEXT: mtvsrd f1, r6 -; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: lis r7, 65 +; P8LE-NEXT: ori r7, r7, 22280 +; P8LE-NEXT: mulld r4, r8, r4 +; P8LE-NEXT: rldicl r8, r6, 32, 48 +; P8LE-NEXT: sldi r7, r7, 32 +; P8LE-NEXT: rlwinm r8, 
r8, 0, 16, 31 +; P8LE-NEXT: oris r7, r7, 61158 +; P8LE-NEXT: rldicl r6, r6, 16, 48 +; P8LE-NEXT: mulld r5, r8, r5 +; P8LE-NEXT: ori r7, r7, 14506 +; P8LE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8LE-NEXT: mulld r6, r6, r7 +; P8LE-NEXT: li r7, 95 +; P8LE-NEXT: mulhdu r3, r3, r7 +; P8LE-NEXT: li r7, 124 +; P8LE-NEXT: mulhdu r4, r4, r7 +; P8LE-NEXT: li r7, 98 +; P8LE-NEXT: mulhdu r5, r5, r7 +; P8LE-NEXT: li r7, 1003 +; P8LE-NEXT: mulhdu r6, r6, r7 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: mtvsrd f1, r4 +; P8LE-NEXT: mtvsrd f2, r5 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: mtvsrd f3, r6 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: xxswapd v5, vs3 ; P8LE-NEXT: vmrglh v2, v3, v2 ; P8LE-NEXT: vmrglh v3, v5, v4 -; P8LE-NEXT: vmrglw v2, v2, v3 +; P8LE-NEXT: vmrglw v2, v3, v2 ; P8LE-NEXT: blr ; ; P8BE-LABEL: fold_urem_vec_1: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: lis r9, 16727 -; P8BE-NEXT: ori r3, r3, 8969 -; P8BE-NEXT: ori r9, r9, 2287 -; P8BE-NEXT: rldicl r5, r4, 16, 48 -; P8BE-NEXT: clrldi r6, r4, 48 -; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 -; P8BE-NEXT: rldicl r7, r4, 48, 48 -; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 -; P8BE-NEXT: clrldi r8, r5, 32 +; P8BE-NEXT: lis r3, 65 +; P8BE-NEXT: mfvsrd r6, v2 +; P8BE-NEXT: lis r4, 668 +; P8BE-NEXT: lis r5, 528 +; P8BE-NEXT: ori r3, r3, 22280 +; P8BE-NEXT: ori r4, r4, 48148 +; P8BE-NEXT: ori r5, r5, 33825 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: clrldi r7, r6, 48 +; P8BE-NEXT: oris r3, r3, 61158 +; P8BE-NEXT: sldi r4, r4, 32 ; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 -; P8BE-NEXT: mulld r3, r8, r3 -; P8BE-NEXT: lis r8, 21399 -; P8BE-NEXT: clrldi r10, r6, 32 -; P8BE-NEXT: ori r8, r8, 33437 -; P8BE-NEXT: clrldi r11, r7, 32 -; P8BE-NEXT: mulld r9, r10, r9 -; P8BE-NEXT: lis r10, 8456 -; P8BE-NEXT: rldicl r4, r4, 32, 48 -; P8BE-NEXT: mulld r8, r11, r8 -; P8BE-NEXT: ori r10, r10, 16913 -; P8BE-NEXT: rlwinm r11, r4, 30, 18, 31 -; P8BE-NEXT: rldicl 
r3, r3, 32, 32 -; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 -; P8BE-NEXT: mulld r10, r11, r10 -; P8BE-NEXT: subf r11, r3, r5 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: rldicl r9, r9, 24, 40 -; P8BE-NEXT: add r3, r11, r3 -; P8BE-NEXT: rldicl r8, r8, 27, 37 -; P8BE-NEXT: srwi r3, r3, 6 -; P8BE-NEXT: mulli r9, r9, 1003 -; P8BE-NEXT: rldicl r10, r10, 30, 34 -; P8BE-NEXT: mulli r8, r8, 98 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: mulli r10, r10, 124 -; P8BE-NEXT: subf r6, r9, r6 -; P8BE-NEXT: subf r7, r8, r7 -; P8BE-NEXT: sldi r6, r6, 48 -; P8BE-NEXT: subf r3, r3, r5 -; P8BE-NEXT: subf r4, r10, r4 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: sldi r5, r7, 48 +; P8BE-NEXT: ori r3, r3, 14506 +; P8BE-NEXT: rldicl r8, r6, 48, 48 +; P8BE-NEXT: oris r4, r4, 58848 +; P8BE-NEXT: mulld r3, r7, r3 +; P8BE-NEXT: lis r7, 689 +; P8BE-NEXT: rlwinm r8, r8, 0, 16, 31 +; P8BE-NEXT: ori r4, r4, 42800 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: ori r7, r7, 55878 +; P8BE-NEXT: mulld r4, r8, r4 +; P8BE-NEXT: rldicl r8, r6, 32, 48 +; P8BE-NEXT: oris r5, r5, 2114 +; P8BE-NEXT: sldi r7, r7, 32 +; P8BE-NEXT: rlwinm r8, r8, 0, 16, 31 +; P8BE-NEXT: ori r5, r5, 4229 +; P8BE-NEXT: rldicl r6, r6, 16, 48 +; P8BE-NEXT: oris r7, r7, 4139 +; P8BE-NEXT: mulld r5, r8, r5 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: ori r7, r7, 7589 +; P8BE-NEXT: mulld r6, r6, r7 +; P8BE-NEXT: li r7, 1003 +; P8BE-NEXT: mulhdu r3, r3, r7 +; P8BE-NEXT: li r7, 98 +; P8BE-NEXT: mulhdu r4, r4, r7 +; P8BE-NEXT: li r7, 124 +; P8BE-NEXT: mulhdu r5, r5, r7 +; P8BE-NEXT: li r7, 95 +; P8BE-NEXT: mulhdu r6, r6, r7 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v3, r5 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: mtvsrd v5, r4 +; P8BE-NEXT: mtvsrd v2, r3 +; P8BE-NEXT: sldi r3, r4, 48 +; P8BE-NEXT: sldi r4, r5, 48 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: sldi r3, r6, 48 +; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: mtvsrd v5, r3 ; P8BE-NEXT: vmrghh v2, v3, v2 -; P8BE-NEXT: vmrghh v3, v4, v5 +; P8BE-NEXT: 
vmrghh v3, v5, v4 ; P8BE-NEXT: vmrghw v2, v3, v2 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, @@ -240,61 +246,37 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; P9LE-LABEL: fold_urem_vec_2: ; P9LE: # %bb.0: +; P9LE-NEXT: lis r4, 689 +; P9LE-NEXT: ori r4, r4, 55878 ; P9LE-NEXT: li r3, 0 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: lis r6, 22765 -; P9LE-NEXT: ori r6, r6, 8969 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 4139 +; P9LE-NEXT: ori r4, r4, 7589 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r5, 95 +; P9LE-NEXT: mulhdu r3, r3, r5 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: mulhdu r3, r3, r5 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: mulhdu r3, r3, r5 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, 
r3 ; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: mulhdu r3, r3, r5 ; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 @@ -305,63 +287,39 @@ ; ; P9BE-LABEL: fold_urem_vec_2: ; P9BE: # %bb.0: +; P9BE-NEXT: lis r4, 689 +; P9BE-NEXT: ori r4, r4, 55878 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: lis r5, 22765 -; P9BE-NEXT: ori r5, r5, 8969 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r6, r4, r3 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r4, r6, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: oris r4, r4, 4139 +; P9BE-NEXT: ori r4, r4, 7589 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r5, 95 +; P9BE-NEXT: mulhdu r3, r3, r5 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r6, r4, r3 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r4, r6, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: mulhdu r3, r3, r5 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, 
r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r6, r4, r3 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r4, r6, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: mulhdu r3, r3, r5 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r5, r4, r3 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: mulhdu r3, r3, r5 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: vmrghh v2, v2, v4 @@ -371,62 +329,34 @@ ; P8LE-LABEL: fold_urem_vec_2: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r4, 22765 -; P8LE-NEXT: std r29, -24(r1) # 8-byte Folded Spill -; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; P8LE-NEXT: ori r4, r4, 8969 -; P8LE-NEXT: mfvsrd r5, f0 -; P8LE-NEXT: clrldi r3, r5, 48 -; P8LE-NEXT: rldicl r6, r5, 48, 48 -; P8LE-NEXT: rlwinm r8, r3, 0, 16, 31 -; P8LE-NEXT: rldicl r7, r5, 32, 48 -; P8LE-NEXT: rlwinm r9, r6, 0, 16, 31 -; P8LE-NEXT: rldicl r5, r5, 16, 48 -; P8LE-NEXT: clrldi r11, r8, 32 -; P8LE-NEXT: rlwinm r10, r7, 0, 16, 31 -; P8LE-NEXT: rlwinm r12, r5, 0, 16, 31 -; P8LE-NEXT: mulld r11, r11, r4 -; P8LE-NEXT: clrldi r0, r9, 32 -; P8LE-NEXT: clrldi r30, r10, 32 -; P8LE-NEXT: clrldi r29, r12, 32 -; P8LE-NEXT: mulld r0, r0, r4 -; P8LE-NEXT: mulld r30, r30, r4 -; P8LE-NEXT: mulld r4, r29, r4 -; P8LE-NEXT: ld r29, -24(r1) # 8-byte Folded Reload -; P8LE-NEXT: rldicl r11, r11, 32, 32 -; P8LE-NEXT: subf r8, r11, r8 -; P8LE-NEXT: rldicl r0, r0, 32, 32 -; P8LE-NEXT: srwi r8, r8, 1 -; P8LE-NEXT: rldicl r30, r30, 32, 32 -; P8LE-NEXT: rldicl r4, r4, 32, 32 
-; P8LE-NEXT: subf r9, r0, r9 -; P8LE-NEXT: add r8, r8, r11 -; P8LE-NEXT: subf r10, r30, r10 -; P8LE-NEXT: subf r11, r4, r12 -; P8LE-NEXT: srwi r9, r9, 1 -; P8LE-NEXT: srwi r8, r8, 6 -; P8LE-NEXT: srwi r10, r10, 1 -; P8LE-NEXT: srwi r11, r11, 1 -; P8LE-NEXT: add r9, r9, r0 -; P8LE-NEXT: add r10, r10, r30 -; P8LE-NEXT: add r4, r11, r4 -; P8LE-NEXT: srwi r9, r9, 6 -; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload -; P8LE-NEXT: mulli r8, r8, 95 -; P8LE-NEXT: srwi r10, r10, 6 -; P8LE-NEXT: srwi r4, r4, 6 -; P8LE-NEXT: mulli r9, r9, 95 -; P8LE-NEXT: mulli r10, r10, 95 -; P8LE-NEXT: mulli r4, r4, 95 -; P8LE-NEXT: subf r3, r8, r3 -; P8LE-NEXT: subf r6, r9, r6 -; P8LE-NEXT: mtvsrd f0, r3 -; P8LE-NEXT: subf r3, r10, r7 -; P8LE-NEXT: subf r4, r4, r5 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: ori r3, r3, 7589 +; P8LE-NEXT: clrldi r5, r4, 48 +; P8LE-NEXT: rldicl r6, r4, 48, 48 +; P8LE-NEXT: rldicl r7, r4, 32, 48 +; P8LE-NEXT: rldicl r4, r4, 16, 48 +; P8LE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8LE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8LE-NEXT: rlwinm r7, r7, 0, 16, 31 +; P8LE-NEXT: rlwinm r4, r4, 0, 16, 31 +; P8LE-NEXT: mulld r5, r5, r3 +; P8LE-NEXT: mulld r6, r6, r3 +; P8LE-NEXT: mulld r7, r7, r3 +; P8LE-NEXT: mulld r3, r4, r3 +; P8LE-NEXT: li r4, 95 +; P8LE-NEXT: mulhdu r5, r5, r4 +; P8LE-NEXT: mulhdu r6, r6, r4 +; P8LE-NEXT: mulhdu r7, r7, r4 +; P8LE-NEXT: mulhdu r3, r3, r4 +; P8LE-NEXT: mtvsrd f0, r5 ; P8LE-NEXT: mtvsrd f1, r6 -; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: mtvsrd f2, r7 ; P8LE-NEXT: xxswapd v2, vs0 -; P8LE-NEXT: mtvsrd f3, r4 +; P8LE-NEXT: mtvsrd f3, r3 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: xxswapd v5, vs3 @@ -437,59 +367,35 @@ ; ; P8BE-LABEL: fold_urem_vec_2: ; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 689 ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: ori r3, r3, 8969 +; P8BE-NEXT: ori r3, r3, 
55878 +; P8BE-NEXT: sldi r3, r3, 32 ; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: oris r3, r3, 4139 ; P8BE-NEXT: rldicl r6, r4, 48, 48 ; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 ; P8BE-NEXT: rldicl r7, r4, 32, 48 -; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 -; P8BE-NEXT: clrldi r8, r5, 32 ; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: ori r3, r3, 7589 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: mulld r5, r5, r3 ; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 -; P8BE-NEXT: clrldi r9, r6, 32 -; P8BE-NEXT: mulld r8, r8, r3 ; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 -; P8BE-NEXT: clrldi r10, r7, 32 -; P8BE-NEXT: mulld r9, r9, r3 -; P8BE-NEXT: clrldi r11, r4, 32 -; P8BE-NEXT: mulld r10, r10, r3 -; P8BE-NEXT: mulld r3, r11, r3 -; P8BE-NEXT: rldicl r8, r8, 32, 32 -; P8BE-NEXT: rldicl r9, r9, 32, 32 -; P8BE-NEXT: subf r11, r8, r5 -; P8BE-NEXT: rldicl r10, r10, 32, 32 -; P8BE-NEXT: subf r12, r9, r6 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: add r8, r11, r8 -; P8BE-NEXT: subf r11, r10, r7 -; P8BE-NEXT: srwi r12, r12, 1 -; P8BE-NEXT: add r9, r12, r9 -; P8BE-NEXT: subf r12, r3, r4 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: srwi r8, r8, 6 -; P8BE-NEXT: add r10, r11, r10 -; P8BE-NEXT: srwi r11, r12, 1 -; P8BE-NEXT: srwi r9, r9, 6 -; P8BE-NEXT: add r3, r11, r3 -; P8BE-NEXT: srwi r10, r10, 6 -; P8BE-NEXT: srwi r3, r3, 6 -; P8BE-NEXT: mulli r8, r8, 95 -; P8BE-NEXT: mulli r9, r9, 95 -; P8BE-NEXT: mulli r10, r10, 95 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: subf r5, r8, r5 -; P8BE-NEXT: subf r6, r9, r6 -; P8BE-NEXT: subf r7, r10, r7 -; P8BE-NEXT: subf r3, r3, r4 -; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: mulld r6, r6, r3 +; P8BE-NEXT: mulld r7, r7, r3 +; P8BE-NEXT: mulld r3, r4, r3 +; P8BE-NEXT: li r4, 95 +; P8BE-NEXT: mulhdu r5, r5, r4 +; P8BE-NEXT: mulhdu r6, r6, r4 +; P8BE-NEXT: mulhdu r7, r7, r4 +; P8BE-NEXT: mulhdu r3, r3, r4 +; P8BE-NEXT: sldi r4, r5, 48 +; P8BE-NEXT: sldi r5, r6, 48 +; P8BE-NEXT: mtvsrd v2, r4 ; 
P8BE-NEXT: sldi r4, r7, 48 -; P8BE-NEXT: mtvsrd v2, r5 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v3, r6 +; P8BE-NEXT: mtvsrd v3, r5 ; P8BE-NEXT: mtvsrd v4, r4 ; P8BE-NEXT: mtvsrd v5, r3 ; P8BE-NEXT: vmrghh v2, v3, v2 @@ -830,30 +736,27 @@ ; P9LE-NEXT: rlwinm r3, r3, 0, 27, 31 ; P9LE-NEXT: xxswapd v3, vs0 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: li r3, 4 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: lis r6, 22765 -; P9LE-NEXT: ori r6, r6, 8969 +; P9LE-NEXT: lis r4, 689 +; P9LE-NEXT: ori r4, r4, 55878 ; P9LE-NEXT: xxswapd v4, vs0 -; P9LE-NEXT: vmrglh v3, v4, v3 -; P9LE-NEXT: clrldi r5, r4, 32 -; P9LE-NEXT: mulld r5, r5, r6 -; P9LE-NEXT: rldicl r5, r5, 32, 32 -; P9LE-NEXT: subf r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 1 -; P9LE-NEXT: add r4, r4, r5 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: oris r4, r4, 4139 +; P9LE-NEXT: ori r4, r4, 7589 +; P9LE-NEXT: rlwinm r3, r3, 0, 29, 31 ; P9LE-NEXT: mtvsrd f0, r3 -; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: li r3, 6 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r3, r3, 0, 29, 31 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 95 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: vmrglh v3, v4, v3 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 -; P9LE-NEXT: vmrglh v2, v4, v2 +; P9LE-NEXT: vmrglh v2, v2, v4 ; P9LE-NEXT: vmrglw v2, v2, v3 ; P9LE-NEXT: blr ; @@ -869,100 +772,91 @@ ; P9BE-NEXT: rlwinm r3, r3, 0, 26, 31 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 6 +; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: lis r5, 22765 -; P9BE-NEXT: ori r5, r5, 8969 +; P9BE-NEXT: rlwinm r3, r3, 0, 29, 31 +; P9BE-NEXT: lis r4, 689 +; P9BE-NEXT: ori r4, r4, 55878 ; P9BE-NEXT: vmrghh v3, v4, v3 -; P9BE-NEXT: clrldi r4, r3, 
32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: rldicl r4, r4, 32, 32 -; P9BE-NEXT: subf r5, r4, r3 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 -; P9BE-NEXT: mulli r4, r4, 95 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: sldi r4, r4, 32 +; P9BE-NEXT: oris r4, r4, 4139 +; P9BE-NEXT: ori r4, r4, 7589 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: rlwinm r3, r3, 0, 29, 31 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 95 +; P9BE-NEXT: mulhdu r3, r3, r4 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: mtvsrd v2, r3 -; P9BE-NEXT: vmrghh v2, v2, v4 +; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v3, v2 ; P9BE-NEXT: blr ; ; P8LE-LABEL: dont_fold_urem_power_of_two: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: ori r3, r3, 8969 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: li r6, 95 +; P8LE-NEXT: ori r3, r3, 55878 +; P8LE-NEXT: sldi r3, r3, 32 ; P8LE-NEXT: mfvsrd r4, f0 +; P8LE-NEXT: oris r3, r3, 4139 +; P8LE-NEXT: ori r3, r3, 7589 ; P8LE-NEXT: rldicl r5, r4, 16, 48 -; P8LE-NEXT: rlwinm r6, r5, 0, 16, 31 -; P8LE-NEXT: clrldi r7, r6, 32 -; P8LE-NEXT: mulld r3, r7, r3 -; P8LE-NEXT: rldicl r7, r4, 48, 48 -; P8LE-NEXT: rlwinm r7, r7, 0, 27, 31 -; P8LE-NEXT: mtvsrd f1, r7 -; P8LE-NEXT: rldicl r3, r3, 32, 32 -; P8LE-NEXT: xxswapd v3, vs1 -; P8LE-NEXT: subf r6, r3, r6 -; P8LE-NEXT: srwi r6, r6, 1 -; P8LE-NEXT: add r3, r6, r3 -; P8LE-NEXT: clrldi r6, r4, 48 -; P8LE-NEXT: srwi r3, r3, 6 +; P8LE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8LE-NEXT: mulld r3, r5, r3 +; P8LE-NEXT: clrldi r5, r4, 48 +; P8LE-NEXT: rlwinm r5, r5, 0, 26, 31 +; P8LE-NEXT: mtvsrd f0, r5 +; P8LE-NEXT: mulhdu r3, r3, r6 +; P8LE-NEXT: rldicl r6, r4, 48, 48 +; P8LE-NEXT: xxswapd v2, vs0 ; P8LE-NEXT: rldicl r4, r4, 32, 48 -; P8LE-NEXT: rlwinm r6, r6, 0, 26, 31 -; P8LE-NEXT: mulli r3, r3, 95 +; 
P8LE-NEXT: rlwinm r5, r6, 0, 27, 31 ; P8LE-NEXT: rlwinm r4, r4, 0, 29, 31 -; P8LE-NEXT: mtvsrd f0, r6 -; P8LE-NEXT: mtvsrd f3, r4 -; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f1, r5 +; P8LE-NEXT: mtvsrd f2, r4 +; P8LE-NEXT: mtvsrd f3, r3 +; P8LE-NEXT: xxswapd v3, vs1 +; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: xxswapd v5, vs3 -; P8LE-NEXT: subf r3, r3, r5 -; P8LE-NEXT: mtvsrd f2, r3 ; P8LE-NEXT: vmrglh v2, v3, v2 -; P8LE-NEXT: xxswapd v4, vs2 -; P8LE-NEXT: vmrglh v3, v4, v5 +; P8LE-NEXT: vmrglh v3, v5, v4 ; P8LE-NEXT: vmrglw v2, v3, v2 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_urem_power_of_two: ; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 689 ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: ori r3, r3, 8969 +; P8BE-NEXT: li r6, 95 +; P8BE-NEXT: ori r3, r3, 55878 +; P8BE-NEXT: sldi r3, r3, 32 ; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r7, r4, 16, 48 +; P8BE-NEXT: oris r3, r3, 4139 ; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 -; P8BE-NEXT: rlwinm r7, r7, 0, 26, 31 -; P8BE-NEXT: clrldi r6, r5, 32 -; P8BE-NEXT: mulld r3, r6, r3 -; P8BE-NEXT: rldicl r3, r3, 32, 32 -; P8BE-NEXT: subf r6, r3, r5 -; P8BE-NEXT: srwi r6, r6, 1 -; P8BE-NEXT: add r3, r6, r3 -; P8BE-NEXT: rldicl r6, r4, 32, 48 -; P8BE-NEXT: srwi r3, r3, 6 +; P8BE-NEXT: ori r3, r3, 7589 +; P8BE-NEXT: mulld r3, r5, r3 +; P8BE-NEXT: rldicl r5, r4, 32, 48 +; P8BE-NEXT: rlwinm r5, r5, 0, 27, 31 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: mtvsrd v2, r5 +; P8BE-NEXT: mulhdu r3, r3, r6 +; P8BE-NEXT: rldicl r6, r4, 16, 48 ; P8BE-NEXT: rldicl r4, r4, 48, 48 -; P8BE-NEXT: rlwinm r6, r6, 0, 27, 31 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: rlwinm r6, r6, 0, 26, 31 ; P8BE-NEXT: rlwinm r4, r4, 0, 29, 31 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: sldi r6, r7, 48 +; P8BE-NEXT: sldi r5, r6, 48 ; P8BE-NEXT: sldi r4, r4, 48 -; P8BE-NEXT: mtvsrd v3, r6 -; P8BE-NEXT: mtvsrd v5, r4 -; P8BE-NEXT: subf r3, r3, r5 -; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: mtvsrd v3, r5 ; 
P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v4, r3 -; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: mtvsrd v5, r3 +; P8BE-NEXT: vmrghh v2, v3, v2 +; P8BE-NEXT: vmrghh v3, v4, v5 ; P8BE-NEXT: vmrghw v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, @@ -973,37 +867,42 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { ; P9LE-LABEL: dont_fold_urem_one: ; P9LE: # %bb.0: +; P9LE-NEXT: lis r4, 2849 +; P9LE-NEXT: ori r4, r4, 25644 ; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: li r5, 0 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: oris r6, r5, 45590 -; P9LE-NEXT: oris r5, r5, 51306 -; P9LE-NEXT: ori r6, r6, 17097 -; P9LE-NEXT: ori r5, r5, 30865 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: mulld r4, r4, r6 -; P9LE-NEXT: lis r6, 24749 -; P9LE-NEXT: ori r6, r6, 47143 -; P9LE-NEXT: rldicl r4, r4, 28, 36 -; P9LE-NEXT: mulli r4, r4, 23 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: sldi r4, r4, 32 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: oris r4, r4, 34192 +; P9LE-NEXT: ori r4, r4, 45591 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 23 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: lis r4, 12 +; P9LE-NEXT: ori r4, r4, 5559 +; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: oris r4, r4, 1244 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31 -; P9LE-NEXT: mulld r4, r4, r6 -; P9LE-NEXT: rldicl r4, r4, 21, 43 -; P9LE-NEXT: mulli r4, r4, 5423 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: ori r4, r4, 48291 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 5423 +; P9LE-NEXT: mulhdu r3, r3, r4 +; P9LE-NEXT: lis r4, 100 +; P9LE-NEXT: ori r4, r4, 13628 ; P9LE-NEXT: xxswapd v3, vs0 +; P9LE-NEXT: sldi r4, r4, 32 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: oris r4, r4, 18438 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: rlwinm r4, r3, 31, 17, 31 -; P9LE-NEXT: mulld r4, r4, r5 -; P9LE-NEXT: rldicl r4, r4, 24, 40 -; 
P9LE-NEXT: mulli r4, r4, 654 -; P9LE-NEXT: subf r3, r4, r3 +; P9LE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9LE-NEXT: ori r4, r4, 17236 +; P9LE-NEXT: mulld r3, r3, r4 +; P9LE-NEXT: li r4, 654 +; P9LE-NEXT: mulhdu r3, r3, r4 ; P9LE-NEXT: xxswapd v4, vs0 ; P9LE-NEXT: mtvsrd f0, r3 ; P9LE-NEXT: xxswapd v2, vs0 @@ -1015,47 +914,49 @@ ; ; P9BE-LABEL: dont_fold_urem_one: ; P9BE: # %bb.0: +; P9BE-NEXT: lis r4, 12 +; P9BE-NEXT: ori r4, r4, 5559 ; P9BE-NEXT: li r3, 6 ; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: lis r5, 24749 -; P9BE-NEXT: ori r5, r5, 47143 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r5 -; P9BE-NEXT: li r5, 0 -; P9BE-NEXT: oris r6, r5, 45590 -; P9BE-NEXT: oris r5, r5, 51306 -; P9BE-NEXT: ori r6, r6, 17097 -; P9BE-NEXT: ori r5, r5, 30865 -; P9BE-NEXT: rldicl r4, r4, 21, 43 -; P9BE-NEXT: mulli r4, r4, 5423 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: oris r4, r4, 1244 +; P9BE-NEXT: ori r4, r4, 48291 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 5423 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 2849 +; P9BE-NEXT: ori r4, r4, 25644 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: oris r4, r4, 34192 ; P9BE-NEXT: mtvsrd v3, r3 ; P9BE-NEXT: li r3, 4 +; P9BE-NEXT: ori r4, r4, 45591 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 -; P9BE-NEXT: clrldi r4, r3, 32 -; P9BE-NEXT: mulld r4, r4, r6 -; P9BE-NEXT: rldicl r4, r4, 28, 36 -; P9BE-NEXT: mulli r4, r4, 23 -; P9BE-NEXT: subf r3, r4, r3 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 23 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: lis r4, 100 +; P9BE-NEXT: ori r4, r4, 13628 +; P9BE-NEXT: sldi r4, r4, 32 ; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: oris r4, r4, 18438 ; P9BE-NEXT: mtvsrd v4, r3 -; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: rlwinm r3, r3, 31, 17, 31 -; P9BE-NEXT: mulld r3, r3, r5 -; P9BE-NEXT: rldicl r3, 
r3, 24, 40 -; P9BE-NEXT: mulli r3, r3, 654 -; P9BE-NEXT: subf r3, r3, r4 -; P9BE-NEXT: sldi r3, r3, 48 -; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: li r3, 0 +; P9BE-NEXT: ori r4, r4, 17236 ; P9BE-NEXT: sldi r3, r3, 48 ; P9BE-NEXT: vmrghh v3, v4, v3 ; P9BE-NEXT: mtvsrd v4, r3 +; P9BE-NEXT: li r3, 2 +; P9BE-NEXT: vextuhlx r3, r3, v2 +; P9BE-NEXT: rlwinm r3, r3, 0, 16, 31 +; P9BE-NEXT: mulld r3, r3, r4 +; P9BE-NEXT: li r4, 654 +; P9BE-NEXT: mulhdu r3, r3, r4 +; P9BE-NEXT: sldi r3, r3, 48 +; P9BE-NEXT: mtvsrd v2, r3 ; P9BE-NEXT: vmrghh v2, v4, v2 ; P9BE-NEXT: vmrghw v2, v2, v3 ; P9BE-NEXT: blr @@ -1063,37 +964,42 @@ ; P8LE-LABEL: dont_fold_urem_one: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: li r3, 0 -; P8LE-NEXT: lis r8, 24749 +; P8LE-NEXT: lis r3, 2849 +; P8LE-NEXT: lis r4, 12 +; P8LE-NEXT: lis r7, 100 ; P8LE-NEXT: xxlxor v5, v5, v5 -; P8LE-NEXT: oris r5, r3, 45590 -; P8LE-NEXT: ori r8, r8, 47143 -; P8LE-NEXT: oris r3, r3, 51306 -; P8LE-NEXT: ori r5, r5, 17097 -; P8LE-NEXT: ori r3, r3, 30865 -; P8LE-NEXT: mfvsrd r4, f0 -; P8LE-NEXT: rldicl r6, r4, 32, 48 -; P8LE-NEXT: rldicl r7, r4, 16, 48 -; P8LE-NEXT: rlwinm r9, r6, 0, 16, 31 -; P8LE-NEXT: rldicl r4, r4, 48, 48 -; P8LE-NEXT: mulld r5, r9, r5 -; P8LE-NEXT: rlwinm r9, r7, 0, 16, 31 -; P8LE-NEXT: mulld r8, r9, r8 -; P8LE-NEXT: rlwinm r9, r4, 31, 17, 31 -; P8LE-NEXT: mulld r3, r9, r3 -; P8LE-NEXT: rldicl r5, r5, 28, 36 -; P8LE-NEXT: rldicl r8, r8, 21, 43 -; P8LE-NEXT: mulli r5, r5, 23 -; P8LE-NEXT: rldicl r3, r3, 24, 40 -; P8LE-NEXT: mulli r8, r8, 5423 -; P8LE-NEXT: mulli r3, r3, 654 -; P8LE-NEXT: subf r5, r5, r6 -; P8LE-NEXT: subf r6, r8, r7 -; P8LE-NEXT: mtvsrd f0, r5 -; P8LE-NEXT: subf r3, r3, r4 -; P8LE-NEXT: mtvsrd f1, r6 -; P8LE-NEXT: mtvsrd f2, r3 +; P8LE-NEXT: ori r3, r3, 25644 +; P8LE-NEXT: ori r4, r4, 5559 +; P8LE-NEXT: ori r7, r7, 13628 +; P8LE-NEXT: sldi r3, r3, 32 +; P8LE-NEXT: sldi r4, r4, 32 +; P8LE-NEXT: mfvsrd r5, f0 +; P8LE-NEXT: oris r3, r3, 34192 +; P8LE-NEXT: oris r4, r4, 1244 +; 
P8LE-NEXT: ori r3, r3, 45591 +; P8LE-NEXT: ori r4, r4, 48291 +; P8LE-NEXT: rldicl r6, r5, 32, 48 +; P8LE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8LE-NEXT: mulld r3, r6, r3 +; P8LE-NEXT: rldicl r6, r5, 16, 48 +; P8LE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8LE-NEXT: rldicl r5, r5, 48, 48 +; P8LE-NEXT: mulld r4, r6, r4 +; P8LE-NEXT: sldi r6, r7, 32 +; P8LE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8LE-NEXT: oris r6, r6, 18438 +; P8LE-NEXT: ori r6, r6, 17236 +; P8LE-NEXT: mulld r5, r5, r6 +; P8LE-NEXT: li r6, 23 +; P8LE-NEXT: mulhdu r3, r3, r6 +; P8LE-NEXT: li r6, 5423 +; P8LE-NEXT: mulhdu r4, r4, r6 +; P8LE-NEXT: li r6, 654 +; P8LE-NEXT: mulhdu r5, r5, r6 +; P8LE-NEXT: mtvsrd f0, r3 +; P8LE-NEXT: mtvsrd f1, r4 ; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: mtvsrd f2, r5 ; P8LE-NEXT: xxswapd v3, vs1 ; P8LE-NEXT: xxswapd v4, vs2 ; P8LE-NEXT: vmrglh v2, v3, v2 @@ -1103,46 +1009,48 @@ ; ; P8BE-LABEL: dont_fold_urem_one: ; P8BE: # %bb.0: +; P8BE-NEXT: lis r3, 12 ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: li r3, 0 -; P8BE-NEXT: lis r8, 24749 -; P8BE-NEXT: oris r6, r3, 51306 -; P8BE-NEXT: ori r8, r8, 47143 -; P8BE-NEXT: oris r3, r3, 45590 -; P8BE-NEXT: rldicl r5, r4, 32, 48 -; P8BE-NEXT: clrldi r7, r4, 48 -; P8BE-NEXT: ori r6, r6, 30865 -; P8BE-NEXT: ori r3, r3, 17097 -; P8BE-NEXT: rldicl r4, r4, 48, 48 -; P8BE-NEXT: rlwinm r9, r5, 31, 17, 31 +; P8BE-NEXT: lis r5, 2849 +; P8BE-NEXT: ori r3, r3, 5559 +; P8BE-NEXT: ori r5, r5, 25644 +; P8BE-NEXT: sldi r3, r3, 32 +; P8BE-NEXT: clrldi r6, r4, 48 +; P8BE-NEXT: oris r3, r3, 1244 +; P8BE-NEXT: rlwinm r6, r6, 0, 16, 31 +; P8BE-NEXT: ori r3, r3, 48291 +; P8BE-NEXT: sldi r5, r5, 32 +; P8BE-NEXT: mulld r3, r6, r3 +; P8BE-NEXT: lis r6, 100 +; P8BE-NEXT: oris r5, r5, 34192 +; P8BE-NEXT: ori r6, r6, 13628 +; P8BE-NEXT: rldicl r7, r4, 48, 48 +; P8BE-NEXT: ori r5, r5, 45591 +; P8BE-NEXT: sldi r6, r6, 32 ; P8BE-NEXT: rlwinm r7, r7, 0, 16, 31 -; P8BE-NEXT: rlwinm r5, r5, 0, 16, 31 +; P8BE-NEXT: rldicl r4, r4, 32, 48 +; P8BE-NEXT: oris r6, r6, 18438 +; P8BE-NEXT: 
mulld r5, r7, r5 ; P8BE-NEXT: rlwinm r4, r4, 0, 16, 31 -; P8BE-NEXT: mulld r6, r9, r6 -; P8BE-NEXT: clrldi r9, r7, 32 -; P8BE-NEXT: mulld r8, r9, r8 -; P8BE-NEXT: clrldi r9, r4, 32 -; P8BE-NEXT: mulld r3, r9, r3 -; P8BE-NEXT: li r9, 0 -; P8BE-NEXT: rldicl r6, r6, 24, 40 -; P8BE-NEXT: mulli r6, r6, 654 -; P8BE-NEXT: rldicl r8, r8, 21, 43 -; P8BE-NEXT: rldicl r3, r3, 28, 36 -; P8BE-NEXT: mulli r8, r8, 5423 -; P8BE-NEXT: mulli r3, r3, 23 -; P8BE-NEXT: subf r5, r6, r5 -; P8BE-NEXT: sldi r6, r9, 48 -; P8BE-NEXT: mtvsrd v2, r6 -; P8BE-NEXT: sldi r5, r5, 48 -; P8BE-NEXT: subf r6, r8, r7 -; P8BE-NEXT: mtvsrd v3, r5 -; P8BE-NEXT: subf r3, r3, r4 -; P8BE-NEXT: sldi r4, r6, 48 +; P8BE-NEXT: ori r6, r6, 17236 +; P8BE-NEXT: mulld r4, r4, r6 +; P8BE-NEXT: li r6, 5423 +; P8BE-NEXT: mulhdu r3, r3, r6 +; P8BE-NEXT: li r6, 23 +; P8BE-NEXT: mulhdu r5, r5, r6 +; P8BE-NEXT: li r6, 654 +; P8BE-NEXT: mulhdu r4, r4, r6 +; P8BE-NEXT: li r6, 0 ; P8BE-NEXT: sldi r3, r3, 48 -; P8BE-NEXT: mtvsrd v4, r4 +; P8BE-NEXT: sldi r6, r6, 48 +; P8BE-NEXT: mtvsrd v3, r3 +; P8BE-NEXT: sldi r5, r5, 48 +; P8BE-NEXT: mtvsrd v2, r6 +; P8BE-NEXT: sldi r3, r4, 48 +; P8BE-NEXT: mtvsrd v4, r5 ; P8BE-NEXT: mtvsrd v5, r3 -; P8BE-NEXT: vmrghh v2, v2, v3 -; P8BE-NEXT: vmrghh v3, v5, v4 +; P8BE-NEXT: vmrghh v3, v4, v3 +; P8BE-NEXT: vmrghh v2, v2, v5 ; P8BE-NEXT: vmrghw v2, v2, v3 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, Index: llvm/test/CodeGen/X86/load-scalar-as-vector.ll =================================================================== --- llvm/test/CodeGen/X86/load-scalar-as-vector.ll +++ llvm/test/CodeGen/X86/load-scalar-as-vector.ll @@ -520,29 +520,21 @@ define <16 x i8> @urem_op1_constant(i8* %p) nounwind { ; SSE-LABEL: urem_op1_constant: ; SSE: # %bb.0: -; SSE-NEXT: movb (%rdi), %al -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrb %cl -; SSE-NEXT: movzbl %cl, %ecx -; SSE-NEXT: imull $49, %ecx, %ecx -; SSE-NEXT: shrl $10, %ecx -; SSE-NEXT: imull $42, %ecx, %ecx -; SSE-NEXT: subb %cl, %al -; SSE-NEXT: movzbl %al, 
%eax +; SSE-NEXT: movzbl (%rdi), %eax +; SSE-NEXT: imull $1561, %eax, %eax # imm = 0x619 +; SSE-NEXT: movzwl %ax, %eax +; SSE-NEXT: imull $42, %eax, %eax +; SSE-NEXT: shrl $16, %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: urem_op1_constant: ; AVX: # %bb.0: -; AVX-NEXT: movb (%rdi), %al -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrb %cl -; AVX-NEXT: movzbl %cl, %ecx -; AVX-NEXT: imull $49, %ecx, %ecx -; AVX-NEXT: shrl $10, %ecx -; AVX-NEXT: imull $42, %ecx, %ecx -; AVX-NEXT: subb %cl, %al -; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: movzbl (%rdi), %eax +; AVX-NEXT: imull $1561, %eax, %eax # imm = 0x619 +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: imull $42, %eax, %eax +; AVX-NEXT: shrl $16, %eax ; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: retq %x = load i8, i8* %p Index: llvm/test/CodeGen/X86/urem-i8-constant.ll =================================================================== --- llvm/test/CodeGen/X86/urem-i8-constant.ll +++ llvm/test/CodeGen/X86/urem-i8-constant.ll @@ -7,11 +7,11 @@ ; CHECK-LABEL: foo: ; CHECK: # %bb.0: ; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: imull $111, %eax, %ecx -; CHECK-NEXT: shrl $12, %ecx -; CHECK-NEXT: leal (%ecx,%ecx,8), %edx -; CHECK-NEXT: leal (%ecx,%edx,4), %ecx -; CHECK-NEXT: subb %cl, %al +; CHECK-NEXT: imull $1772, %eax, %eax # imm = 0x6EC +; CHECK-NEXT: movzwl %ax, %eax +; CHECK-NEXT: leal (%eax,%eax,8), %ecx +; CHECK-NEXT: leal (%eax,%ecx,4), %eax +; CHECK-NEXT: shrl $16, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retl %t546 = urem i8 %tmp325, 37 Index: llvm/test/CodeGen/X86/urem-lkk.ll =================================================================== --- llvm/test/CodeGen/X86/urem-lkk.ll +++ llvm/test/CodeGen/X86/urem-lkk.ll @@ -4,17 +4,13 @@ define i32 @fold_urem_positive_odd(i32 %x) { ; CHECK-LABEL: fold_urem_positive_odd: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: imulq $1491936009, %rcx, %rcx # imm = 0x58ED2309 
-; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: subl %ecx, %edx -; CHECK-NEXT: shrl %edx -; CHECK-NEXT: addl %ecx, %edx -; CHECK-NEXT: shrl $6, %edx -; CHECK-NEXT: imull $95, %edx, %ecx -; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: movabsq $194176253407468965, %rax # imm = 0x2B1DA46102B1DA5 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $95, %ecx +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq %1 = urem i32 %x, 95 ret i32 %1 @@ -24,13 +20,13 @@ define i32 @fold_urem_positive_even(i32 %x) { ; CHECK-LABEL: fold_urem_positive_even: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: movl $4149100483, %edx # imm = 0xF74E3FC3 -; CHECK-NEXT: imulq %rcx, %rdx -; CHECK-NEXT: shrq $42, %rdx -; CHECK-NEXT: imull $1060, %edx, %ecx # imm = 0x424 -; CHECK-NEXT: subl %ecx, %eax +; CHECK-NEXT: movabsq $17402588748782596, %rax # imm = 0x3DD38FF08B1C04 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movl $1060, %ecx # imm = 0x424 +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq %1 = urem i32 %x, 1060 ret i32 %1 Index: llvm/test/CodeGen/X86/urem-vector-lkk.ll =================================================================== --- llvm/test/CodeGen/X86/urem-vector-lkk.ll +++ llvm/test/CodeGen/X86/urem-vector-lkk.ll @@ -7,39 +7,24 @@ ; SSE-LABEL: fold_urem_vec_1: ; SSE: # %bb.0: ; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrl $2, %ecx -; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 -; SSE-NEXT: shrl $19, %ecx -; SSE-NEXT: imull $124, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: movzwl %cx, %edx -; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 -; SSE-NEXT: shrl $22, %edx -; SSE-NEXT: imull $95, %edx, %edx -; SSE-NEXT: subl %edx, %ecx +; SSE-NEXT: imull $34636834, %eax, 
%eax # imm = 0x2108422 +; SSE-NEXT: imulq $124, %rax, %rax +; SSE-NEXT: shrq $32, %rax +; SSE-NEXT: pextrw $0, %xmm0, %ecx +; SSE-NEXT: imull $45210183, %ecx, %ecx # imm = 0x2B1DA47 +; SSE-NEXT: imulq $95, %rcx, %rcx +; SSE-NEXT: shrq $32, %rcx ; SSE-NEXT: movd %ecx, %xmm1 ; SSE-NEXT: pinsrw $1, %eax, %xmm1 ; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrl %ecx -; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 -; SSE-NEXT: shrl $17, %ecx -; SSE-NEXT: imull $98, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: imull $43826197, %eax, %eax # imm = 0x29CBC15 +; SSE-NEXT: imulq $98, %rax, %rax +; SSE-NEXT: shrq $32, %rax ; SSE-NEXT: pinsrw $2, %eax, %xmm1 ; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: movl %eax, %edx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: shrl %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: shrl $9, %edx -; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB -; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: imull $4282121, %eax, %eax # imm = 0x415709 +; SSE-NEXT: imulq $1003, %rax, %rax # imm = 0x3EB +; SSE-NEXT: shrq $32, %rax ; SSE-NEXT: pinsrw $3, %eax, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -47,39 +32,24 @@ ; AVX-LABEL: fold_urem_vec_1: ; AVX: # %bb.0: ; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $2, %ecx -; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 -; AVX-NEXT: shrl $19, %ecx -; AVX-NEXT: imull $124, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: movzwl %cx, %edx -; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 -; AVX-NEXT: shrl $22, %edx -; AVX-NEXT: imull $95, %edx, %edx -; AVX-NEXT: subl %edx, %ecx +; AVX-NEXT: imull $34636834, %eax, %eax # imm = 0x2108422 +; AVX-NEXT: imulq $124, %rax, %rax +; AVX-NEXT: shrq $32, %rax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: imull $45210183, %ecx, %ecx # imm = 
0x2B1DA47 +; AVX-NEXT: imulq $95, %rcx, %rcx +; AVX-NEXT: shrq $32, %rcx ; AVX-NEXT: vmovd %ecx, %xmm1 ; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl %ecx -; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 -; AVX-NEXT: shrl $17, %ecx -; AVX-NEXT: imull $98, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: imull $43826197, %eax, %eax # imm = 0x29CBC15 +; AVX-NEXT: imulq $98, %rax, %rax +; AVX-NEXT: shrq $32, %rax ; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $9, %edx -; AVX-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB -; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: imull $4282121, %eax, %eax # imm = 0x415709 +; AVX-NEXT: imulq $1003, %rax, %rax # imm = 0x3EB +; AVX-NEXT: shrq $32, %rax ; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 ; AVX-NEXT: retq %1 = urem <4 x i16> %x, @@ -96,13 +66,47 @@ ; SSE-NEXT: psubw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: fold_urem_vec_2: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: fold_urem_vec_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [45210183,45210183,45210183,45210183] +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [95,95,95,95] +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; 
AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: fold_urem_vec_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [45210183,45210183,45210183,45210183,45210183,45210183,45210183,45210183] +; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [95,95,95,95,95,95,95,95] +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 } @@ -139,41 +143,39 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; SSE-LABEL: dont_fold_urem_power_of_two: ; SSE: # %bb.0: +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: andl $31, %eax +; 
SSE-NEXT: movd %xmm0, %ecx +; SSE-NEXT: andl $63, %ecx +; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: pinsrw $1, %eax, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: andl $7, %eax +; SSE-NEXT: pinsrw $2, %eax, %xmm1 ; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; SSE-NEXT: shrl $22, %ecx -; SSE-NEXT: imull $95, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pextrw $1, %xmm0, %ecx -; SSE-NEXT: andl $31, %ecx -; SSE-NEXT: movd %xmm0, %edx -; SSE-NEXT: andl $63, %edx -; SSE-NEXT: movd %edx, %xmm1 -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %ecx -; SSE-NEXT: andl $7, %ecx -; SSE-NEXT: pinsrw $2, %ecx, %xmm1 +; SSE-NEXT: imull $45210183, %eax, %eax # imm = 0x2B1DA47 +; SSE-NEXT: imulq $95, %rax, %rax +; SSE-NEXT: shrq $32, %rax ; SSE-NEXT: pinsrw $3, %eax, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_urem_power_of_two: ; AVX: # %bb.0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: andl $31, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: andl $63, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: andl $7, %eax +; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; AVX-NEXT: shrl $22, %ecx -; AVX-NEXT: imull $95, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpextrw $1, %xmm0, %ecx -; AVX-NEXT: andl $31, %ecx -; AVX-NEXT: vmovd %xmm0, %edx -; AVX-NEXT: andl $63, %edx -; AVX-NEXT: vmovd %edx, %xmm1 -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %ecx -; AVX-NEXT: andl $7, %ecx -; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 -; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: imull $45210183, %eax, %eax # imm = 0x2B1DA47 +; AVX-NEXT: imulq $95, %rax, %rax +; AVX-NEXT: shrq $32, %rax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 ; AVX-NEXT: retq %1 = urem <4 x i16> %x, 
ret <4 x i16> %1 @@ -183,64 +185,46 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { ; SSE-LABEL: dont_fold_urem_one: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: movl %eax, %edx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: shrl %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: shrl $4, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx ; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B -; SSE-NEXT: shrl $25, %ecx -; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: imull $6567229, %eax, %eax # imm = 0x64353D +; SSE-NEXT: imulq $654, %rax, %rax # imm = 0x28E +; SSE-NEXT: shrq $32, %rax ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: imull $186737709, %eax, %eax # imm = 0xB21642D +; SSE-NEXT: leaq (%rax,%rax,2), %rcx +; SSE-NEXT: shlq $3, %rcx +; SSE-NEXT: subq %rax, %rcx +; SSE-NEXT: shrq $32, %rcx +; SSE-NEXT: pinsrw $2, %ecx, %xmm1 ; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 -; SSE-NEXT: shrl $26, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax +; SSE-NEXT: imull $791992, %eax, %eax # imm = 0xC15B8 +; SSE-NEXT: imulq $5423, %rax, %rax # imm = 0x152F +; SSE-NEXT: shrq $32, %rax ; SSE-NEXT: pinsrw $3, %eax, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: dont_fold_urem_one: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: imull $25645, %eax, %ecx # imm = 0x642D -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $4, %edx -; AVX-NEXT: 
leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx ; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B -; AVX-NEXT: shrl $25, %ecx -; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: imull $6567229, %eax, %eax # imm = 0x64353D +; AVX-NEXT: imulq $654, %rax, %rax # imm = 0x28E +; AVX-NEXT: shrq $32, %rax ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: imull $186737709, %eax, %eax # imm = 0xB21642D +; AVX-NEXT: leaq (%rax,%rax,2), %rcx +; AVX-NEXT: shlq $3, %rcx +; AVX-NEXT: subq %rax, %rcx +; AVX-NEXT: shrq $32, %rcx +; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 ; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 -; AVX-NEXT: shrl $26, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: imull $791992, %eax, %eax # imm = 0xC15B8 +; AVX-NEXT: imulq $5423, %rax, %rax # imm = 0x152F +; AVX-NEXT: shrq $32, %rax ; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 ; AVX-NEXT: retq %1 = urem <4 x i16> %x, @@ -375,4 +359,4 @@ ; AVX2-NEXT: retq %1 = urem <4 x i64> %x, ret <4 x i64> %1 -} \ No newline at end of file +} Index: llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -616,16 +616,63 @@ ; SSE-NEXT: psubw %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_rem7_8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX-NEXT: vpaddw %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; 
AVX-NEXT: retq +; AVX1-LABEL: test_rem7_8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7] +; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2NOBW-LABEL: test_rem7_8i16: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX2NOBW-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] +; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2NOBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2NOBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2NOBW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2NOBW-NEXT: vzeroupper +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512BW-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %res = urem <8 x i16> %a, ret <8 x i16> %res } @@ -690,61 +737,37 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363] ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw 
%xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpmulhuw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmulhuw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2NOBW-LABEL: test_rem7_16i8: ; AVX2NOBW: # %bb.0: -; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX2NOBW-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; 
AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vzeroupper ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: test_rem7_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vpsllw $3, %xmm1, %xmm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: 
vzeroupper ; AVX512BW-NEXT: retq %res = urem <16 x i8> %a, @@ -852,93 +875,36 @@ ; AVX1-LABEL: test_remconstant_16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpsllw $7, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7] -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7] -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: 
vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2NOBW-LABEL: test_remconstant_16i8: ; AVX2NOBW: # %bb.0: -; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: 
vpsrlw $8, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX2NOBW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2NOBW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vzeroupper ; AVX2NOBW-NEXT: retq ; ; AVX512BW-LABEL: test_remconstant_16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} 
ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 -; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,1,3,3,3,2,2,2,2,3,3,3,1,1,2] -; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %res = urem <16 x i8> %a, Index: 
llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -561,16 +561,29 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_rem7_16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1 -; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_rem7_16i16: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1 +; AVX2NOBW-NEXT: vpsubw %ymm1, %ymm0, %ymm2 +; AVX2NOBW-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX2NOBW-NEXT: vpaddw %ymm1, %ymm2, %ymm1 +; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512BW-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm0 +; AVX512BW-NEXT: retq %res = urem <16 x i16> %a, ret <16 x i16> %res } @@ -646,20 +659,10 @@ ; ; 
AVX512BW-LABEL: test_rem7_32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm2 -; AVX512BW-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vpsllw $3, %ymm1, %ymm2 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq %res = urem <32 x i8> %a, ret <32 x i8> %res @@ -791,23 +794,10 @@ ; ; AVX512BW-LABEL: test_remconstant_32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm2 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2 -; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovwb 
%zmm1, %ymm1 -; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BW-NEXT: retq %res = urem <32 x i8> %a, ret <32 x i8> %res Index: llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll =================================================================== --- llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -440,22 +440,23 @@ ; AVX512F-LABEL: test_rem7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm4 -; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddw %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512F-NEXT: vpmulld %zmm2, %zmm1, %zmm1 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm4 +; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] +; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm4 +; AVX512F-NEXT: vpmovdw %zmm4, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpmulld %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmuludq %zmm3, %zmm0, %zmm2 +; AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; Index: llvm/test/CodeGen/X86/vector-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/vector-intrinsics.ll +++ llvm/test/CodeGen/X86/vector-intrinsics.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- | grep call | count 43 declare <4 x double> @llvm.sin.v4f64(<4 x double> %p) Index: llvm/test/CodeGen/X86/vector-rem.ll 
=================================================================== --- llvm/test/CodeGen/X86/vector-rem.ll +++ llvm/test/CodeGen/X86/vector-rem.ll @@ -81,30 +81,30 @@ ; CHECK-LABEL: qux: ; CHECK: # %bb.0: ; CHECK-NEXT: subq $72, %rsp -; CHECK-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; CHECK-NEXT: callq fmodf ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; CHECK-NEXT: callq fmodf ; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: callq fmodf -; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: shufps 
{{.*#+}} xmm1 = xmm1[1,1,2,3] ; CHECK-NEXT: callq fmodf -; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] Index: llvm/test/CodeGen/X86/vector-truncate-combine.ll =================================================================== --- llvm/test/CodeGen/X86/vector-truncate-combine.ll +++ llvm/test/CodeGen/X86/vector-truncate-combine.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-- -O2 -start-after=stack-protector -stop-before=loops %s -o - | FileCheck %s ; This test verifies the fix for PR33368. Index: llvm/test/CodeGen/X86/vector-variable-idx.ll =================================================================== --- llvm/test/CodeGen/X86/vector-variable-idx.ll +++ llvm/test/CodeGen/X86/vector-variable-idx.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- | grep movss | count 2 ; PR2676 Index: llvm/test/CodeGen/X86/vector-variable-idx2.ll =================================================================== --- llvm/test/CodeGen/X86/vector-variable-idx2.ll +++ llvm/test/CodeGen/X86/vector-variable-idx2.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mattr=+sse4.1 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" Index: llvm/test/CodeGen/X86/vector-width-store-merge.ll =================================================================== --- llvm/test/CodeGen/X86/vector-width-store-merge.ll +++ llvm/test/CodeGen/X86/vector-width-store-merge.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s ; This tests whether or not we generate vectors large than preferred vector width when @@ -5,40 +6,60 @@ ; Function Attrs: nounwind uwtable define weak_odr dso_local void @A(i8* %src, i8* %dst) local_unnamed_addr #0 { +; CHECK-LABEL: A: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovups (%rdi), %xmm0 +; CHECK-NEXT: vmovups 16(%rdi), %xmm1 +; CHECK-NEXT: vmovups %xmm1, 16(%rsi) +; CHECK-NEXT: vmovups %xmm0, (%rsi) +; CHECK-NEXT: retq entry: -; CHECK: A -; CHECK-NOT: vmovups %ymm -; CHECK: vmovups %xmm call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false) ret void } ; Function Attrs: nounwind uwtable define weak_odr dso_local void @B(i8* %src, i8* %dst) local_unnamed_addr #0 { +; CHECK-LABEL: B: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovups (%rdi), %xmm0 +; CHECK-NEXT: vmovups 16(%rdi), %xmm1 +; CHECK-NEXT: vmovups 32(%rdi), %xmm2 +; CHECK-NEXT: vmovups 48(%rdi), %xmm3 +; CHECK-NEXT: vmovups %xmm3, 48(%rsi) +; CHECK-NEXT: vmovups %xmm2, 32(%rsi) +; CHECK-NEXT: vmovups %xmm1, 16(%rsi) +; CHECK-NEXT: vmovups %xmm0, (%rsi) +; CHECK-NEXT: retq entry: -; CHECK: B -; CHECK-NOT: vmovups %zmm -; CHECK: vmovups %xmm call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 64, i1 false) ret void } ; Function Attrs: nounwind uwtable define weak_odr dso_local void @C(i8* %src, i8* %dst) local_unnamed_addr #2 { +; CHECK-LABEL: C: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovups (%rdi), %ymm0 +; CHECK-NEXT: vmovups %ymm0, (%rsi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq entry: -; CHECK: C -; CHECK-NOT: vmovups %ymm -; CHECK: vmovups %ymm call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false) ret void } ; Function Attrs: nounwind uwtable define weak_odr dso_local void @D(i8* %src, i8* %dst) local_unnamed_addr #2 { +; CHECK-LABEL: D: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: 
vmovups (%rdi), %ymm0 +; CHECK-NEXT: vmovups 32(%rdi), %ymm1 +; CHECK-NEXT: vmovups %ymm1, 32(%rsi) +; CHECK-NEXT: vmovups %ymm0, (%rsi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq entry: -; CHECK: D -; CHECK-NOT: vmovups %zmm -; CHECK: vmovups %ymm call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 64, i1 false) ret void }