diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -527,6 +527,13 @@ "HasFastVariablePerLaneShuffle", "true", "Per-lane shuffles with variable masks are fast">; +// Prefer lowering shuffles on AVX512 targets (e.g. Skylake Server) to +// imm shifts/rotates if they can use more ports than regular shuffles. +def TuningPreferShiftShuffle : SubtargetFeature<"faster-shift-than-shuffle", + "PreferLowerShuffleAsShift", "true", + "Shifts are faster than (or as fast as) shuffles">; + + // On some X86 processors, a vzeroupper instruction should be inserted after // using ymm/zmm registers before executing code that may use SSE instructions. def TuningInsertVZEROUPPER @@ -840,7 +847,8 @@ TuningPrefer256Bit, TuningPOPCNTFalseDeps, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningPreferShiftShuffle]; list<SubtargetFeature> SKXFeatures = !listconcat(BDWFeatures, SKXAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -14099,7 +14099,7 @@ SDValue V2, ArrayRef<int> Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { + SelectionDAG &DAG, bool BitwiseOnly) { int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); @@ -14118,6 +14118,9 @@ V = V2; } + if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ)) + return SDValue(); + if (ShiftAmt < 0) return SDValue(); @@ -15285,8 +15288,9 @@ return Extract; // Try to use shift instructions. - if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, + DAG, /*BitwiseOnly*/ false)) return Shift; // When loading a scalar and then shuffling it into a vector we can often do @@ -15560,6 +15564,18 @@ int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); + // Try to use shift instructions if fast. + if (Subtarget.hasFasterShiftThanShuffle()) { + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, + Subtarget, DAG, /*BitwiseOnly*/ true)) + return Shift; + if (NumV2Elements == 0) + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG)) + return Rotate; + } + if (NumV2Elements == 0) { // Try to use broadcast unless the mask only has one non-undef element. if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) { @@ -15589,9 +15605,14 @@ return Extract; // Try to use shift instructions. - if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, + DAG, /*BitwiseOnly*/ false)) return Shift; + if (!Subtarget.hasFasterShiftThanShuffle() && NumV2Elements == 0) + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG)) + return Rotate; // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) @@ -16251,8 +16272,9 @@ if (NumV2Inputs == 0) { // Try to use shift instructions. 
- if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable, + Subtarget, DAG, /*BitwiseOnly*/ false)) return Shift; // Check for being able to broadcast a single element. @@ -16290,8 +16312,9 @@ "shuffles."); // Try to use shift instructions. - if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, + DAG, /*BitwiseOnly*/ false)) return Shift; // See if we can use SSE4A Extraction / Insertion. @@ -16502,8 +16525,9 @@ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Try to use shift instructions. - if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, + DAG, /*BitwiseOnly*/ false)) return Shift; // Try to use byte rotation instructions. @@ -16545,6 +16569,7 @@ Mask, Subtarget, DAG)) return Broadcast; + // Try to use bit rotation instructions. if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask, Subtarget, DAG)) @@ -18307,6 +18332,13 @@ Subtarget, DAG)) return Broadcast; + // Try to use shift instructions if fast. + if (Subtarget.hasFasterShiftThanShuffle()) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, + Subtarget, DAG, /*BitwiseOnly*/ true)) + return Shift; + if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on both lanes. @@ -18328,8 +18360,9 @@ } // Try to use shift instructions. - if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, + DAG, /*BitwiseOnly*/ false)) return Shift; // If we have VLX support, we can use VALIGN or VEXPAND. @@ -18522,6 +18555,8 @@ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!"); + int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; }); + // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. @@ -18553,6 +18588,18 @@ Subtarget, DAG)) return Broadcast; + // Try to use shift instructions if fast. + if (Subtarget.hasFasterShiftThanShuffle()) { + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, + Subtarget, DAG, /*BitwiseOnly*/ true)) + return Shift; + if (NumV2Elements == 0) + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG)) + return Rotate; + } + // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the two 128-bit // lanes. @@ -18571,10 +18618,16 @@ } // Try to use shift instructions. 
- if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, + DAG, /*BitwiseOnly*/ false)) return Shift; + if (!Subtarget.hasFasterShiftThanShuffle() && NumV2Elements == 0) + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG)) + return Rotate; + // If we have VLX support, we can use VALIGN or EXPAND. if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask, @@ -18675,8 +18728,9 @@ return V; // Try to use shift instructions. - if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable, + Subtarget, DAG, /*BitwiseOnly*/ false)) return Shift; // Try to use byte rotation instructions. @@ -18797,8 +18851,9 @@ return V; // Try to use shift instructions. - if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, + DAG, /*BitwiseOnly*/ false)) return Shift; // Try to use byte rotation instructions. @@ -19187,6 +19242,13 @@ assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + // Try to use shift instructions if fast. + if (Subtarget.hasFasterShiftThanShuffle()) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, + Subtarget, DAG, /*BitwiseOnly*/ true)) + return Shift; + if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on all four @@ -19213,8 +19275,9 @@ return Shuf128; // Try to use shift instructions. - if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, + DAG, /*BitwiseOnly*/ false)) return Shift; // Try to use VALIGN. @@ -19252,6 +19315,8 @@ assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); + // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. @@ -19259,6 +19324,18 @@ DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; + // Try to use shift instructions if fast. + if (Subtarget.hasFasterShiftThanShuffle()) { + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable, + Subtarget, DAG, /*BitwiseOnly*/ true)) + return Shift; + if (NumV2Elements == 0) + if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, + Subtarget, DAG)) + return Rotate; + } + // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the four 128-bit // lanes. @@ -19277,10 +19354,16 @@ } // Try to use shift instructions. 
- if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable, + Subtarget, DAG, /*BitwiseOnly*/ false)) return Shift; + if (!Subtarget.hasFasterShiftThanShuffle() && NumV2Elements != 0) + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG)) + return Rotate; + // Try to use VALIGN. if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) @@ -19347,8 +19430,9 @@ return V; // Try to use shift instructions. - if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable, + Subtarget, DAG, /*BitwiseOnly*/ false)) return Shift; // Try to use byte rotation instructions. @@ -19410,8 +19494,9 @@ return V; // Try to use shift instructions. - if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + if (SDValue Shift = + lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, + DAG, /*BitwiseOnly*/ false)) return Shift; // Try to use byte rotation instructions. @@ -38659,85 +38744,95 @@ } } - // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns. - // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we - // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). - if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) && - !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) { - SmallVector RepeatedMask; - if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { - // Narrow the repeated mask to create 32-bit element permutes. - SmallVector WordMask = RepeatedMask; - if (MaskScalarSizeInBits == 64) - narrowShuffleMaskElts(2, RepeatedMask, WordMask); - - Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); - ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32); - ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); - PermuteImm = getV4X86ShuffleImm(WordMask); - return true; - } - } - - // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns. - if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 && - ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || - (MaskVT.is512BitVector() && Subtarget.hasBWI()))) { - SmallVector RepeatedMask; - if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { - ArrayRef LoMask(RepeatedMask.data() + 0, 4); - ArrayRef HiMask(RepeatedMask.data() + 4, 4); - - // PSHUFLW: permute lower 4 elements only. - if (isUndefOrInRange(LoMask, 0, 4) && - isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { - Shuffle = X86ISD::PSHUFLW; - ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); - PermuteImm = getV4X86ShuffleImm(LoMask); - return true; + // We are checking for shuffle match or shift match. Loop twice so we can + // order which we try and match first depending on target preference. + for (unsigned Order = 0; Order < 2; ++Order) { + if (Subtarget.hasFasterShiftThanShuffle() ? (Order == 1) : (Order == 0)) { + // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns. + // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we + // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). 
+ if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) && + !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) { + SmallVector RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { + // Narrow the repeated mask to create 32-bit element permutes. + SmallVector WordMask = RepeatedMask; + if (MaskScalarSizeInBits == 64) + narrowShuffleMaskElts(2, RepeatedMask, WordMask); + + Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); + ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32); + ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); + PermuteImm = getV4X86ShuffleImm(WordMask); + return true; + } } - // PSHUFHW: permute upper 4 elements only. - if (isUndefOrInRange(HiMask, 4, 8) && - isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { - // Offset the HiMask so that we can create the shuffle immediate. - int OffsetHiMask[4]; - for (int i = 0; i != 4; ++i) - OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4); + // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns. + if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 && + ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || + (MaskVT.is512BitVector() && Subtarget.hasBWI()))) { + SmallVector RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { + ArrayRef LoMask(RepeatedMask.data() + 0, 4); + ArrayRef HiMask(RepeatedMask.data() + 4, 4); + + // PSHUFLW: permute lower 4 elements only. + if (isUndefOrInRange(LoMask, 0, 4) && + isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { + Shuffle = X86ISD::PSHUFLW; + ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); + PermuteImm = getV4X86ShuffleImm(LoMask); + return true; + } - Shuffle = X86ISD::PSHUFHW; - ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); - PermuteImm = getV4X86ShuffleImm(OffsetHiMask); - return true; + // PSHUFHW: permute upper 4 elements only. + if (isUndefOrInRange(HiMask, 4, 8) && + isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { + // Offset the HiMask so that we can create the shuffle immediate. + int OffsetHiMask[4]; + for (int i = 0; i != 4; ++i) + OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4); + + Shuffle = X86ISD::PSHUFHW; + ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); + PermuteImm = getV4X86ShuffleImm(OffsetHiMask); + return true; + } + } + } + } else { + // Attempt to match against bit rotates. + if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 && + ((MaskVT.is128BitVector() && Subtarget.hasXOP()) || + Subtarget.hasAVX512())) { + int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits, + Subtarget, Mask); + if (0 < RotateAmt) { + Shuffle = X86ISD::VROTLI; + PermuteImm = (unsigned)RotateAmt; + return true; + } } } - } - - // Attempt to match against byte/bit shifts. - if (AllowIntDomain && - ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || - (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { - int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, - Mask, 0, Zeroable, Subtarget); - if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() || - 32 <= ShuffleVT.getScalarSizeInBits())) { - PermuteImm = (unsigned)ShiftAmt; - return true; - } - } - - // Attempt to match against bit rotates. 
- if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 && - ((MaskVT.is128BitVector() && Subtarget.hasXOP()) || - Subtarget.hasAVX512())) { - int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits, - Subtarget, Mask); - if (0 < RotateAmt) { - Shuffle = X86ISD::VROTLI; - PermuteImm = (unsigned)RotateAmt; - return true; + // Attempt to match against byte/bit shifts. + if (AllowIntDomain && + ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || + (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { + int ShiftAmt = + matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0, + Zeroable, Subtarget); + // Byte shifts can be slower so only match them on second attempt. + if (Order == 0 && + (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ)) + continue; + if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() || + 32 <= ShuffleVT.getScalarSizeInBits())) { + PermuteImm = (unsigned)ShiftAmt; + return true; + } } } diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -249,6 +249,7 @@ return hasBWI() && canExtendTo512DQ(); } + bool hasFasterShiftThanShuffle() const { return PreferLowerShuffleAsShift; } // If there are no 512-bit vectors and we prefer not to use 512-bit registers, // disable them in the legalizer. bool useAVX512Regs() const { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -88,6 +88,7 @@ X86::TuningInsertVZEROUPPER, X86::TuningUseSLMArithCosts, X86::TuningUseGLMDivSqrtCosts, + X86::TuningPreferShiftShuffle, // Perf-tuning flags. 
X86::TuningFastGather, diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll --- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll +++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll @@ -16,7 +16,7 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vmovd %xmm0, %eax ; SKX-NEXT: vzeroupper @@ -43,7 +43,7 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 ; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vmovd %xmm0, %eax ; SKX-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-VBMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit | FileCheck %s --check-prefixes=CHECK,CHECK-SKX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 -mattr=prefer-256-bit,avx512vbmi | FileCheck %s --check-prefixes=CHECK,CHECK-SKX,CHECK-SKX-VBMI ; Make sure CPUs default to prefer-256-bit. 
avx512vnni isn't interesting as it just adds an isel peephole for vpmaddwd+vpaddd ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-avx512vnni -mcpu=cascadelake | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 @@ -170,40 +170,110 @@ } define dso_local i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="256" { -; CHECK-LABEL: _Z9test_charPcS_i_256: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB8_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 -; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 -; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 -; CHECK-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 -; CHECK-NEXT: vpaddd %ymm2, %ymm3, %ymm2 -; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 -; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 -; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1 -; CHECK-NEXT: addq $32, %rcx -; CHECK-NEXT: cmpq %rcx, %rax -; CHECK-NEXT: jne .LBB8_1 -; CHECK-NEXT: # %bb.2: # %middle.block -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm1 -; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: _Z9test_charPcS_i_256: +; CHECK-SKX: # %bb.0: # %entry +; CHECK-SKX-NEXT: movl %edx, %eax +; CHECK-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-SKX-NEXT: xorl %ecx, %ecx +; CHECK-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-SKX-NEXT: .p2align 4, 0x90 +; CHECK-SKX-NEXT: .LBB8_1: # %vector.body +; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-SKX-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 +; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 +; CHECK-SKX-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 +; CHECK-SKX-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; CHECK-SKX-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 +; CHECK-SKX-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 +; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; CHECK-SKX-NEXT: addq $32, %rcx +; CHECK-SKX-NEXT: cmpq %rcx, %rax +; CHECK-SKX-NEXT: jne .LBB8_1 +; CHECK-SKX-NEXT: # %bb.2: # %middle.block +; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm1 +; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 +; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vmovd %xmm0, %eax +; CHECK-SKX-NEXT: vzeroupper +; CHECK-SKX-NEXT: retq +; +; CHECK-AVX512-LABEL: _Z9test_charPcS_i_256: +; CHECK-AVX512: # %bb.0: # %entry +; CHECK-AVX512-NEXT: movl %edx, %eax +; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: xorl %ecx, %ecx +; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; 
CHECK-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-AVX512-NEXT: .p2align 4, 0x90 +; CHECK-AVX512-NEXT: .LBB8_1: # %vector.body +; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-AVX512-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 +; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 +; CHECK-AVX512-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 +; CHECK-AVX512-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; CHECK-AVX512-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 +; CHECK-AVX512-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 +; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; CHECK-AVX512-NEXT: addq $32, %rcx +; CHECK-AVX512-NEXT: cmpq %rcx, %rax +; CHECK-AVX512-NEXT: jne .LBB8_1 +; CHECK-AVX512-NEXT: # %bb.2: # %middle.block +; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm1 +; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1 +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vmovd %xmm0, %eax +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq +; +; CHECK-VBMI-LABEL: _Z9test_charPcS_i_256: +; CHECK-VBMI: # %bb.0: # %entry +; CHECK-VBMI-NEXT: movl %edx, %eax +; CHECK-VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: xorl %ecx, %ecx +; CHECK-VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-VBMI-NEXT: .p2align 4, 0x90 +; CHECK-VBMI-NEXT: .LBB8_1: # %vector.body +; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-VBMI-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3 +; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4 +; CHECK-VBMI-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5 +; CHECK-VBMI-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3 +; CHECK-VBMI-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3 +; CHECK-VBMI-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3 +; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm3, %ymm1 +; CHECK-VBMI-NEXT: addq $32, %rcx +; CHECK-VBMI-NEXT: cmpq %rcx, %rax +; CHECK-VBMI-NEXT: jne .LBB8_1 +; CHECK-VBMI-NEXT: # %bb.2: # %middle.block +; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm1 +; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vmovd %xmm0, %eax +; CHECK-VBMI-NEXT: vzeroupper +; CHECK-VBMI-NEXT: retq entry: %3 = zext i32 %2 to i64 br label %vector.body @@ -241,35 +311,95 @@ } define dso_local i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="512" { -; CHECK-LABEL: _Z9test_charPcS_i_512: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB9_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 -; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 -; CHECK-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 -; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 -; CHECK-NEXT: addq $32, 
%rcx -; CHECK-NEXT: cmpq %rcx, %rax -; CHECK-NEXT: jne .LBB9_1 -; CHECK-NEXT: # %bb.2: # %middle.block -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: _Z9test_charPcS_i_512: +; CHECK-SKX: # %bb.0: # %entry +; CHECK-SKX-NEXT: movl %edx, %eax +; CHECK-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-SKX-NEXT: xorl %ecx, %ecx +; CHECK-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-SKX-NEXT: .p2align 4, 0x90 +; CHECK-SKX-NEXT: .LBB9_1: # %vector.body +; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-SKX-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 +; CHECK-SKX-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 +; CHECK-SKX-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 +; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; CHECK-SKX-NEXT: addq $32, %rcx +; CHECK-SKX-NEXT: cmpq %rcx, %rax +; CHECK-SKX-NEXT: jne .LBB9_1 +; CHECK-SKX-NEXT: # %bb.2: # %middle.block +; CHECK-SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 +; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vmovd %xmm0, %eax +; CHECK-SKX-NEXT: vzeroupper +; CHECK-SKX-NEXT: retq +; +; CHECK-AVX512-LABEL: _Z9test_charPcS_i_512: +; CHECK-AVX512: # %bb.0: # %entry +; CHECK-AVX512-NEXT: movl %edx, %eax +; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: xorl %ecx, %ecx +; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512-NEXT: .p2align 4, 0x90 +; CHECK-AVX512-NEXT: .LBB9_1: # %vector.body +; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2 +; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 +; CHECK-AVX512-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 +; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; CHECK-AVX512-NEXT: addq $32, %rcx +; CHECK-AVX512-NEXT: cmpq %rcx, %rax +; CHECK-AVX512-NEXT: jne .LBB9_1 +; CHECK-AVX512-NEXT: # %bb.2: # %middle.block +; CHECK-AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1 +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vmovd %xmm0, %eax +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq +; +; CHECK-VBMI-LABEL: _Z9test_charPcS_i_512: +; CHECK-VBMI: # %bb.0: # %entry +; CHECK-VBMI-NEXT: movl %edx, %eax +; CHECK-VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: xorl %ecx, %ecx +; CHECK-VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-VBMI-NEXT: .p2align 4, 0x90 +; CHECK-VBMI-NEXT: .LBB9_1: # %vector.body +; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-VBMI-NEXT: vpmovsxbw (%rdi,%rcx), 
%zmm2 +; CHECK-VBMI-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3 +; CHECK-VBMI-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2 +; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; CHECK-VBMI-NEXT: addq $32, %rcx +; CHECK-VBMI-NEXT: cmpq %rcx, %rax +; CHECK-VBMI-NEXT: jne .LBB9_1 +; CHECK-VBMI-NEXT: # %bb.2: # %middle.block +; CHECK-VBMI-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vmovd %xmm0, %eax +; CHECK-VBMI-NEXT: vzeroupper +; CHECK-VBMI-NEXT: retq entry: %3 = zext i32 %2 to i64 br label %vector.body @@ -310,30 +440,80 @@ @b = dso_local global [1024 x i8] zeroinitializer, align 16 define dso_local i32 @sad_16i8_256() "min-legal-vector-width"="256" { -; CHECK-LABEL: sad_16i8_256: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB10_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm2 -; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 -; CHECK-NEXT: vpaddd %ymm1, %ymm2, %ymm1 -; CHECK-NEXT: addq $4, %rax -; CHECK-NEXT: jne .LBB10_1 -; CHECK-NEXT: # %bb.2: # %middle.block -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: sad_16i8_256: +; CHECK-SKX: # %bb.0: # %entry +; CHECK-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-SKX-NEXT: movq $-1024, %rax # imm = 0xFC00 +; CHECK-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-SKX-NEXT: .p2align 4, 0x90 +; CHECK-SKX-NEXT: .LBB10_1: # %vector.body +; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-SKX-NEXT: vmovdqu a+1024(%rax), %xmm2 +; CHECK-SKX-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 +; CHECK-SKX-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; CHECK-SKX-NEXT: addq $4, %rax +; CHECK-SKX-NEXT: jne .LBB10_1 +; CHECK-SKX-NEXT: # %bb.2: # %middle.block +; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 +; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vmovd %xmm0, %eax +; CHECK-SKX-NEXT: vzeroupper +; CHECK-SKX-NEXT: retq +; +; CHECK-AVX512-LABEL: sad_16i8_256: +; CHECK-AVX512: # %bb.0: # %entry +; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00 +; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX512-NEXT: .p2align 4, 0x90 +; CHECK-AVX512-NEXT: .LBB10_1: # %vector.body +; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-AVX512-NEXT: vmovdqu a+1024(%rax), %xmm2 +; CHECK-AVX512-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 +; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; 
CHECK-AVX512-NEXT: addq $4, %rax +; CHECK-AVX512-NEXT: jne .LBB10_1 +; CHECK-AVX512-NEXT: # %bb.2: # %middle.block +; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1 +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vmovd %xmm0, %eax +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq +; +; CHECK-VBMI-LABEL: sad_16i8_256: +; CHECK-VBMI: # %bb.0: # %entry +; CHECK-VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: movq $-1024, %rax # imm = 0xFC00 +; CHECK-VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-VBMI-NEXT: .p2align 4, 0x90 +; CHECK-VBMI-NEXT: .LBB10_1: # %vector.body +; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-VBMI-NEXT: vmovdqu a+1024(%rax), %xmm2 +; CHECK-VBMI-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2 +; CHECK-VBMI-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; CHECK-VBMI-NEXT: addq $4, %rax +; CHECK-VBMI-NEXT: jne .LBB10_1 +; CHECK-VBMI-NEXT: # %bb.2: # %middle.block +; CHECK-VBMI-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vmovd %xmm0, %eax +; CHECK-VBMI-NEXT: vzeroupper +; CHECK-VBMI-NEXT: retq entry: br label %vector.body @@ -371,30 +551,80 @@ } define dso_local i32 @sad_16i8_512() "min-legal-vector-width"="512" { -; CHECK-LABEL: sad_16i8_512: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00 -; CHECK-NEXT: .p2align 4, 0x90 -; CHECK-NEXT: .LBB11_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm1 -; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: addq $4, %rax -; CHECK-NEXT: jne .LBB11_1 -; CHECK-NEXT: # %bb.2: # %middle.block -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: sad_16i8_512: +; CHECK-SKX: # %bb.0: # %entry +; CHECK-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-SKX-NEXT: movq $-1024, %rax # imm = 0xFC00 +; CHECK-SKX-NEXT: .p2align 4, 0x90 +; CHECK-SKX-NEXT: .LBB11_1: # %vector.body +; CHECK-SKX-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-SKX-NEXT: vmovdqu a+1024(%rax), %xmm1 +; CHECK-SKX-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 +; CHECK-SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-SKX-NEXT: addq $4, %rax +; CHECK-SKX-NEXT: jne .LBB11_1 +; CHECK-SKX-NEXT: # %bb.2: # %middle.block +; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; 
CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm1 +; CHECK-SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vmovd %xmm0, %eax +; CHECK-SKX-NEXT: vzeroupper +; CHECK-SKX-NEXT: retq +; +; CHECK-AVX512-LABEL: sad_16i8_512: +; CHECK-AVX512: # %bb.0: # %entry +; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00 +; CHECK-AVX512-NEXT: .p2align 4, 0x90 +; CHECK-AVX512-NEXT: .LBB11_1: # %vector.body +; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-AVX512-NEXT: vmovdqu a+1024(%rax), %xmm1 +; CHECK-AVX512-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 +; CHECK-AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-AVX512-NEXT: addq $4, %rax +; CHECK-AVX512-NEXT: jne .LBB11_1 +; CHECK-AVX512-NEXT: # %bb.2: # %middle.block +; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1 +; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vmovd %xmm0, %eax +; CHECK-AVX512-NEXT: vzeroupper +; CHECK-AVX512-NEXT: retq +; +; CHECK-VBMI-LABEL: sad_16i8_512: +; CHECK-VBMI: # %bb.0: # %entry +; CHECK-VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: movq $-1024, %rax # imm = 0xFC00 +; CHECK-VBMI-NEXT: .p2align 4, 0x90 +; CHECK-VBMI-NEXT: .LBB11_1: # %vector.body +; CHECK-VBMI-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-VBMI-NEXT: vmovdqu a+1024(%rax), %xmm1 +; CHECK-VBMI-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1 +; CHECK-VBMI-NEXT: vpaddd %zmm0, %zmm1, %zmm0 +; CHECK-VBMI-NEXT: addq $4, %rax +; CHECK-VBMI-NEXT: jne .LBB11_1 +; CHECK-VBMI-NEXT: # %bb.2: # %middle.block +; CHECK-VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-VBMI-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-VBMI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-VBMI-NEXT: vmovd %xmm0, %eax +; CHECK-VBMI-NEXT: vzeroupper +; CHECK-VBMI-NEXT: retq entry: br label %vector.body @@ -653,6 +883,32 @@ } define dso_local void @mul256(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vector-width"="256" { +; CHECK-SKX-VBMI-LABEL: mul256: +; CHECK-SKX-VBMI: # %bb.0: +; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-SKX-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] +; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1 +; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4 +; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-SKX-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0 +; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-SKX-VBMI-NEXT: vzeroupper +; CHECK-SKX-VBMI-NEXT: retq +; ; CHECK-AVX512-LABEL: mul256: ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vmovdqa (%rdi), %ymm0 @@ -716,6 +972,22 @@ } define dso_local void @mul512(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vector-width"="512" { +; CHECK-SKX-VBMI-LABEL: mul512: +; CHECK-SKX-VBMI: # %bb.0: +; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 +; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1 +; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] +; CHECK-SKX-VBMI-NEXT: vpmullw %zmm2, %zmm3, %zmm2 +; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; CHECK-SKX-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm0 +; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126] +; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 +; CHECK-SKX-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx) +; CHECK-SKX-VBMI-NEXT: vzeroupper +; CHECK-SKX-VBMI-NEXT: retq +; ; CHECK-AVX512-LABEL: mul512: ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 @@ -882,6 +1154,13 @@ } define <32 x i8> @trunc_v32i16_v32i8_zeroes(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" { +; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_zeroes: +; CHECK-SKX-VBMI: # %bb.0: +; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; CHECK-SKX-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0 +; CHECK-SKX-VBMI-NEXT: retq +; ; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_zeroes: ; CHECK-AVX512: # 
%bb.0: ; CHECK-AVX512-NEXT: vpsrlw $8, 32(%rdi), %ymm0 @@ -931,6 +1210,13 @@ } define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" { +; CHECK-SKX-VBMI-LABEL: trunc_v32i16_v32i8_sign: +; CHECK-SKX-VBMI: # %bb.0: +; CHECK-SKX-VBMI-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; CHECK-SKX-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0 +; CHECK-SKX-VBMI-NEXT: retq +; ; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_sign: ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vpsrlw $8, 32(%rdi), %ymm0 @@ -1640,6 +1926,19 @@ } define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind "min-legal-vector-width"="256" { +; CHECK-SKX-VBMI-LABEL: var_rotate_v16i8: +; CHECK-SKX-VBMI: # %bb.0: +; CHECK-SKX-VBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SKX-VBMI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; CHECK-SKX-VBMI-NEXT: vpermb %ymm0, %ymm2, %ymm0 +; CHECK-SKX-VBMI-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; CHECK-SKX-VBMI-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; CHECK-SKX-VBMI-NEXT: vpsrlw $8, %ymm0, %ymm0 +; CHECK-SKX-VBMI-NEXT: vpmovwb %ymm0, %xmm0 +; CHECK-SKX-VBMI-NEXT: vzeroupper +; CHECK-SKX-VBMI-NEXT: retq +; ; CHECK-AVX512-LABEL: var_rotate_v16i8: ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll --- a/llvm/test/CodeGen/X86/pr57340.ll +++ b/llvm/test/CodeGen/X86/pr57340.ll @@ -47,7 +47,7 @@ ; CHECK-NEXT: movw $-5, %ax ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: kandw %k1, %k0, %k0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; CHECK-NEXT: vprolq $32, %xmm1, %xmm0 ; CHECK-NEXT: vpextrw $0, %xmm0, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm0 @@ -200,7 +200,7 @@ ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: movw $-1025, %ax # imm = 0xFBFF ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-NEXT: vprolq $32, %xmm1, %xmm2 ; CHECK-NEXT: vpextrw $0, %xmm2, %eax ; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: vmovd %eax, %xmm2 diff --git a/llvm/test/CodeGen/X86/shuffle-as-shifts.ll b/llvm/test/CodeGen/X86/shuffle-as-shifts.ll --- a/llvm/test/CodeGen/X86/shuffle-as-shifts.ll +++ b/llvm/test/CodeGen/X86/shuffle-as-shifts.ll @@ -6,33 +6,87 @@ define <4 x i32> @shuf_rot_v4i32_1032(<4 x i32> %x) { -; CHECK-LABEL: shuf_rot_v4i32_1032: -; CHECK: # %bb.0: -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: shuf_rot_v4i32_1032: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vprolq $32, %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: shuf_rot_v4i32_1032: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-ICX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: shuf_rot_v4i32_1032: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-V4-NEXT: vprolq $32, %xmm0, %xmm0 +; CHECK-V4-NEXT: 
retq +; +; CHECK-ZNVER4-LABEL: shuf_rot_v4i32_1032: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-ZNVER4-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; CHECK-ZNVER4-NEXT: retq %x1 = add <4 x i32> %x, %x %r = shufflevector <4 x i32> %x1, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %r } define <8 x i32> @shuf_rot_v8i32_10325476(<8 x i32> %x) { -; CHECK-LABEL: shuf_rot_v8i32_10325476: -; CHECK: # %bb.0: -; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: shuf_rot_v8i32_10325476: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; CHECK-SKX-NEXT: vprolq $32, %ymm0, %ymm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: shuf_rot_v8i32_10325476: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; CHECK-ICX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: shuf_rot_v8i32_10325476: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; CHECK-V4-NEXT: vprolq $32, %ymm0, %ymm0 +; CHECK-V4-NEXT: retq +; +; CHECK-ZNVER4-LABEL: shuf_rot_v8i32_10325476: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; CHECK-ZNVER4-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; CHECK-ZNVER4-NEXT: retq %x1 = add <8 x i32> %x, %x %r = shufflevector <8 x i32> %x1, <8 x i32> zeroinitializer, <8 x i32> ret <8 x i32> %r } define <16 x i32> @shuf_rot_v16i32_1032547698111013121514(<16 x i32> %x) { -; CHECK-LABEL: shuf_rot_v16i32_1032547698111013121514: -; CHECK: # %bb.0: -; CHECK-NEXT: vpaddd %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: shuf_rot_v16i32_1032547698111013121514: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; CHECK-SKX-NEXT: vprolq $32, %zmm0, %zmm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: shuf_rot_v16i32_1032547698111013121514: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; CHECK-ICX-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: shuf_rot_v16i32_1032547698111013121514: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; CHECK-V4-NEXT: vprolq $32, %zmm0, %zmm0 +; CHECK-V4-NEXT: retq +; +; CHECK-ZNVER4-LABEL: shuf_rot_v16i32_1032547698111013121514: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; CHECK-ZNVER4-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; CHECK-ZNVER4-NEXT: retq %x1 = add <16 x i32> %x, %x %r = shufflevector <16 x i32> %x1, <16 x i32> zeroinitializer, <16 x i32> ret <16 x i32> %r @@ -105,33 +159,87 @@ } define <4 x i32> @shuf_shr_v4i32_1U3U(<4 x i32> %x) { -; CHECK-LABEL: shuf_shr_v4i32_1U3U: -; CHECK: # %bb.0: -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: shuf_shr_v4i32_1U3U: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpsrlq $32, %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: shuf_shr_v4i32_1U3U: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-ICX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: shuf_shr_v4i32_1U3U: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-V4-NEXT: vpsrlq $32, %xmm0, 
%xmm0 +; CHECK-V4-NEXT: retq +; +; CHECK-ZNVER4-LABEL: shuf_shr_v4i32_1U3U: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-ZNVER4-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-ZNVER4-NEXT: retq %x1 = add <4 x i32> %x, %x %r = shufflevector <4 x i32> %x1, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %r } define <8 x i32> @shuf_shr_v8i32_1U3U5U7U(<8 x i32> %x) { -; CHECK-LABEL: shuf_shr_v8i32_1U3U5U7U: -; CHECK: # %bb.0: -; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: shuf_shr_v8i32_1U3U5U7U: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; CHECK-SKX-NEXT: vpsrlq $32, %ymm0, %ymm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: shuf_shr_v8i32_1U3U5U7U: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; CHECK-ICX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: shuf_shr_v8i32_1U3U5U7U: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; CHECK-V4-NEXT: vpsrlq $32, %ymm0, %ymm0 +; CHECK-V4-NEXT: retq +; +; CHECK-ZNVER4-LABEL: shuf_shr_v8i32_1U3U5U7U: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; CHECK-ZNVER4-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; CHECK-ZNVER4-NEXT: retq %x1 = add <8 x i32> %x, %x %r = shufflevector <8 x i32> %x1, <8 x i32> zeroinitializer, <8 x i32> ret <8 x i32> %r } define <16 x i32> @shuf_shr_v16i32_U3U5U7U9U11U13U15(<16 x i32> %x) { -; CHECK-LABEL: shuf_shr_v16i32_U3U5U7U9U11U13U15: -; CHECK: # %bb.0: -; CHECK-NEXT: vpaddd %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: shuf_shr_v16i32_U3U5U7U9U11U13U15: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; CHECK-SKX-NEXT: vpsrlq $32, %zmm0, %zmm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: shuf_shr_v16i32_U3U5U7U9U11U13U15: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; CHECK-ICX-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: shuf_shr_v16i32_U3U5U7U9U11U13U15: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; CHECK-V4-NEXT: vpsrlq $32, %zmm0, %zmm0 +; CHECK-V4-NEXT: retq +; +; CHECK-ZNVER4-LABEL: shuf_shr_v16i32_U3U5U7U9U11U13U15: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; CHECK-ZNVER4-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; CHECK-ZNVER4-NEXT: retq %x1 = add <16 x i32> %x, %x %r = shufflevector <16 x i32> %x1, <16 x i32> zeroinitializer, <16 x i32> ret <16 x i32> %r @@ -171,33 +279,87 @@ } define <4 x i32> @shuf_shl_v4i32_U0U2(<4 x i32> %x) { -; CHECK-LABEL: shuf_shl_v4i32_U0U2: -; CHECK: # %bb.0: -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: shuf_shl_v4i32_U0U2: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-SKX-NEXT: vpsllq $32, %xmm0, %xmm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: shuf_shl_v4i32_U0U2: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-ICX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: shuf_shl_v4i32_U0U2: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-V4-NEXT: vpsllq $32, %xmm0, %xmm0 
+; CHECK-V4-NEXT: retq +; +; CHECK-ZNVER4-LABEL: shuf_shl_v4i32_U0U2: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-ZNVER4-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; CHECK-ZNVER4-NEXT: retq %x1 = add <4 x i32> %x, %x %r = shufflevector <4 x i32> %x1, <4 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %r } define <8 x i32> @shuf_shl_v8i32_U0U2U4U6(<8 x i32> %x) { -; CHECK-LABEL: shuf_shl_v8i32_U0U2U4U6: -; CHECK: # %bb.0: -; CHECK-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: shuf_shl_v8i32_U0U2U4U6: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; CHECK-SKX-NEXT: vpsllq $32, %ymm0, %ymm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: shuf_shl_v8i32_U0U2U4U6: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; CHECK-ICX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: shuf_shl_v8i32_U0U2U4U6: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; CHECK-V4-NEXT: vpsllq $32, %ymm0, %ymm0 +; CHECK-V4-NEXT: retq +; +; CHECK-ZNVER4-LABEL: shuf_shl_v8i32_U0U2U4U6: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; CHECK-ZNVER4-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] +; CHECK-ZNVER4-NEXT: retq %x1 = add <8 x i32> %x, %x %r = shufflevector <8 x i32> %x1, <8 x i32> zeroinitializer, <8 x i32> ret <8 x i32> %r } define <16 x i32> @shuf_shl_v16i32_U0U2U4U6U8U10U12U14(<16 x i32> %x) { -; CHECK-LABEL: shuf_shl_v16i32_U0U2U4U6U8U10U12U14: -; CHECK: # %bb.0: -; CHECK-NEXT: vpaddd %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] -; CHECK-NEXT: retq +; CHECK-SKX-LABEL: shuf_shl_v16i32_U0U2U4U6U8U10U12U14: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; CHECK-SKX-NEXT: vpsllq $32, %zmm0, %zmm0 +; CHECK-SKX-NEXT: retq +; +; CHECK-ICX-LABEL: shuf_shl_v16i32_U0U2U4U6U8U10U12U14: +; CHECK-ICX: # %bb.0: +; CHECK-ICX-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; CHECK-ICX-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-ICX-NEXT: retq +; +; CHECK-V4-LABEL: shuf_shl_v16i32_U0U2U4U6U8U10U12U14: +; CHECK-V4: # %bb.0: +; CHECK-V4-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; CHECK-V4-NEXT: vpsllq $32, %zmm0, %zmm0 +; CHECK-V4-NEXT: retq +; +; CHECK-ZNVER4-LABEL: shuf_shl_v16i32_U0U2U4U6U8U10U12U14: +; CHECK-ZNVER4: # %bb.0: +; CHECK-ZNVER4-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; CHECK-ZNVER4-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-ZNVER4-NEXT: retq %x1 = add <16 x i32> %x, %x %r = shufflevector <16 x i32> %x1, <16 x i32> zeroinitializer, <16 x i32> ret <16 x i32> %r @@ -235,8 +397,3 @@ %r = shufflevector <64 x i8> %x1, <64 x i8> zeroinitializer, <64 x i32> ret <64 x i8> %r } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-ICX: {{.*}} -; CHECK-SKX: {{.*}} -; CHECK-V4: {{.*}} -; CHECK-ZNVER4: {{.*}}