diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -527,6 +527,14 @@
                                           "HasFastVariablePerLaneShuffle", "true",
                                           "Per-lane shuffles with variable masks are fast">;
 
+// Skylake server has AVX512 (and so `vpro{r|l}{d|q}`) but not the extra
+// shuffle port that other AVX512 targets have, so prefer lowering shuffles
+// with shifts/rotates.
+def TuningPreferShiftShuffle : SubtargetFeature<"faster-shift-than-shuffle",
+                                   "PreferLowerShuffleAsShift", "true",
+                                   "Shifts are as fast as (or faster than) shuffles">;
+
+
 // On some X86 processors, a vzeroupper instruction should be inserted after
 // using ymm/zmm registers before executing code that may use SSE instructions.
 def TuningInsertVZEROUPPER
@@ -840,7 +848,8 @@
                                         TuningPrefer256Bit,
                                         TuningPOPCNTFalseDeps,
                                         TuningInsertVZEROUPPER,
-                                        TuningAllowLight256Bit];
+                                        TuningAllowLight256Bit,
+                                        TuningPreferShiftShuffle];
 
   list<SubtargetFeature> SKXFeatures =
       !listconcat(BDWFeatures, SKXAdditionalFeatures);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15566,6 +15566,12 @@
   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
 
+  // Try to use shift instructions.
+  if (Subtarget.hasFasterShiftThanShuffle())
+    if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
+                                            Zeroable, Subtarget, DAG))
+      return Shift;
+
   if (NumV2Elements == 0) {
     // Try to use broadcast unless the mask only has one non-undef element.
     if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
@@ -15595,9 +15601,10 @@
     return Extract;
 
   // Try to use shift instructions.
-  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
-                                          Zeroable, Subtarget, DAG))
-    return Shift;
+  if (!Subtarget.hasFasterShiftThanShuffle())
+    if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
+                                            Zeroable, Subtarget, DAG))
+      return Shift;
 
   // There are special ways we can lower some single-element blends.
   if (NumV2Elements == 1)
@@ -18278,31 +18285,36 @@
                                                 Subtarget, DAG))
     return Broadcast;
 
-  if (V2.isUndef()) {
-    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
-    // can use lower latency instructions that will operate on both lanes.
-    SmallVector<int> RepeatedMask;
-    if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
-      SmallVector<int> PSHUFDMask;
-      narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
-      return DAG.getBitcast(
-          MVT::v4i64,
-          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
-                      DAG.getBitcast(MVT::v8i32, V1),
-                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
-    }
+  for (unsigned Order = 0; Order < 2; ++Order) {
+    if (Subtarget.hasFasterShiftThanShuffle() ? (Order == 1) : (Order == 0)) {
+      if (V2.isUndef()) {
+        // When the shuffle is mirrored between the 128-bit lanes of the unit,
+        // we can use lower latency instructions that will operate on both
+        // lanes.
+        SmallVector<int> RepeatedMask;
+        if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
+          SmallVector<int> PSHUFDMask;
+          narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
+          return DAG.getBitcast(
+              MVT::v4i64,
+              DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
+                          DAG.getBitcast(MVT::v8i32, V1),
+                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+        }
 
-    // AVX2 provides a direct instruction for permuting a single input across
-    // lanes.
-    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
-                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+        // AVX2 provides a direct instruction for permuting a single input
+        // across lanes.
+        return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
+                           getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
+      }
+    } else {
+      // Try to use shift instructions.
+      if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
+                                              Zeroable, Subtarget, DAG))
+        return Shift;
+    }
   }
 
-  // Try to use shift instructions.
-  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
-                                          Zeroable, Subtarget, DAG))
-    return Shift;
-
   // If we have VLX support, we can use VALIGN or VEXPAND.
   if (Subtarget.hasVLX()) {
     if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
@@ -18542,22 +18554,26 @@
   SmallVector<int> RepeatedMask;
   bool Is128BitLaneRepeatedShuffle =
       is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
-  if (Is128BitLaneRepeatedShuffle) {
-    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
-    if (V2.isUndef())
-      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
-                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
-
-    // Use dedicated unpack instructions for masks that match their pattern.
-    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
-      return V;
+  for (unsigned Order = 0; Order < 2; ++Order) {
+    if (Subtarget.hasFasterShiftThanShuffle() ? (Order == 1) : (Order == 0)) {
+      if (Is128BitLaneRepeatedShuffle) {
+        assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+        if (V2.isUndef())
+          return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
+                             getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+        // Use dedicated unpack instructions for masks that match their pattern.
+        if (SDValue V =
+                lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
+          return V;
+      }
+    } else {
+      // Try to use shift instructions.
+      if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
+                                              Zeroable, Subtarget, DAG))
+        return Shift;
+    }
   }
-
-  // Try to use shift instructions.
-  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
-                                          Zeroable, Subtarget, DAG))
-    return Shift;
-
   // If we have VLX support, we can use VALIGN or EXPAND.
   if (Subtarget.hasVLX()) {
     if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
@@ -19171,35 +19187,42 @@
   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
 
-  if (V2.isUndef()) {
-    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
-    // can use lower latency instructions that will operate on all four
-    // 128-bit lanes.
-    SmallVector<int> Repeated128Mask;
-    if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
-      SmallVector<int> PSHUFDMask;
-      narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
-      return DAG.getBitcast(
-          MVT::v8i64,
-          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
-                      DAG.getBitcast(MVT::v16i32, V1),
-                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
-    }
+  for (unsigned Order = 0; Order < 2; ++Order) {
+    if (Subtarget.hasFasterShiftThanShuffle() ? (Order == 1) : (Order == 0)) {
+      if (V2.isUndef()) {
+        // When the shuffle is mirrored between the 128-bit lanes of the unit,
+        // we can use lower latency instructions that will operate on all four
+        // 128-bit lanes.
+        SmallVector<int> Repeated128Mask;
+        if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask,
+                                            Repeated128Mask)) {
+          SmallVector<int> PSHUFDMask;
+          narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
+          return DAG.getBitcast(
+              MVT::v8i64,
+              DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
+                          DAG.getBitcast(MVT::v16i32, V1),
+                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
+        }
 
-    SmallVector<int> Repeated256Mask;
-    if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
-      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
-                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
-  }
+        SmallVector<int> Repeated256Mask;
+        if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
+          return DAG.getNode(
+              X86ISD::VPERMI, DL, MVT::v8i64, V1,
+              getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
+      }
 
-  if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
-                                           V2, Subtarget, DAG))
-    return Shuf128;
+      if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable,
+                                               V1, V2, Subtarget, DAG))
+        return Shuf128;
+    } else {
 
-  // Try to use shift instructions.
-  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
-                                          Zeroable, Subtarget, DAG))
-    return Shift;
+      // Try to use shift instructions.
+      if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
+                                              Zeroable, Subtarget, DAG))
+        return Shift;
+    }
+  }
 
   // Try to use VALIGN.
   if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
@@ -19249,22 +19272,27 @@
   SmallVector<int> RepeatedMask;
   bool Is128BitLaneRepeatedShuffle =
       is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
-  if (Is128BitLaneRepeatedShuffle) {
-    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
-    if (V2.isUndef())
-      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
-                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
-
-    // Use dedicated unpack instructions for masks that match their pattern.
-    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
-      return V;
+  for (unsigned Order = 0; Order < 2; ++Order) {
+    if (Subtarget.hasFasterShiftThanShuffle() ? (Order == 1) : (Order == 0)) {
+      if (Is128BitLaneRepeatedShuffle) {
+        assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
+        if (V2.isUndef())
+          return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
+                             getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
+
+        // Use dedicated unpack instructions for masks that match their pattern.
+        if (SDValue V =
+                lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
+          return V;
+      }
+    } else {
+      // Try to use shift instructions.
+      if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
+                                              Zeroable, Subtarget, DAG))
+        return Shift;
+    }
   }
-
-  // Try to use shift instructions.
-  if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
-                                          Zeroable, Subtarget, DAG))
-    return Shift;
-
   // Try to use VALIGN.
   if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
                                             Subtarget, DAG))
@@ -38636,85 +38664,91 @@
     }
   }
 
-  // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
-  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
-  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
-  if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
-      !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
-    SmallVector<int> RepeatedMask;
-    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
-      // Narrow the repeated mask to create 32-bit element permutes.
-      SmallVector<int> WordMask = RepeatedMask;
-      if (MaskScalarSizeInBits == 64)
-        narrowShuffleMaskElts(2, RepeatedMask, WordMask);
-
-      Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
-      ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
-      ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
-      PermuteImm = getV4X86ShuffleImm(WordMask);
-      return true;
-    }
-  }
-
-  // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
-  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
-      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
-       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
-       (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
-    SmallVector<int> RepeatedMask;
-    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
-      ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
-      ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
-
-      // PSHUFLW: permute lower 4 elements only.
-      if (isUndefOrInRange(LoMask, 0, 4) &&
-          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
-        Shuffle = X86ISD::PSHUFLW;
-        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
-        PermuteImm = getV4X86ShuffleImm(LoMask);
-        return true;
+  for (unsigned Order = 0; Order < 2; ++Order) {
+    if (Subtarget.hasFasterShiftThanShuffle() ? (Order == 1) : (Order == 0)) {
+      // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
+      // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
+      // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
+      if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
+          !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
+        SmallVector<int> RepeatedMask;
+        if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
+          // Narrow the repeated mask to create 32-bit element permutes.
+          SmallVector<int> WordMask = RepeatedMask;
+          if (MaskScalarSizeInBits == 64)
+            narrowShuffleMaskElts(2, RepeatedMask, WordMask);
+
+          Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
+          ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
+          ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
+          PermuteImm = getV4X86ShuffleImm(WordMask);
+          return true;
+        }
       }
 
-      // PSHUFHW: permute upper 4 elements only.
-      if (isUndefOrInRange(HiMask, 4, 8) &&
-          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
-        // Offset the HiMask so that we can create the shuffle immediate.
-        int OffsetHiMask[4];
-        for (int i = 0; i != 4; ++i)
-          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
+      // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
+      if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
+          ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+           (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+           (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
+        SmallVector<int> RepeatedMask;
+        if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
+          ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
+          ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
+
+          // PSHUFLW: permute lower 4 elements only.
+          if (isUndefOrInRange(LoMask, 0, 4) &&
+              isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
+            Shuffle = X86ISD::PSHUFLW;
+            ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
+            PermuteImm = getV4X86ShuffleImm(LoMask);
+            return true;
+          }
 
-        Shuffle = X86ISD::PSHUFHW;
-        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
-        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
-        return true;
+          // PSHUFHW: permute upper 4 elements only.
+          if (isUndefOrInRange(HiMask, 4, 8) &&
+              isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
+            // Offset the HiMask so that we can create the shuffle immediate.
+            int OffsetHiMask[4];
+            for (int i = 0; i != 4; ++i)
+              OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
+
+            Shuffle = X86ISD::PSHUFHW;
+            ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
+            PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
+            return true;
+          }
+        }
+      }
+    } else {
+      // Attempt to match against byte/bit shifts.
+      if (AllowIntDomain &&
+          ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+           (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+           (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
+        int ShiftAmt =
+            matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask,
+                                0, Zeroable, Subtarget);
+        if (0 < ShiftAmt &&
+            (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
+             32 <= ShuffleVT.getScalarSizeInBits())) {
+          PermuteImm = (unsigned)ShiftAmt;
+          return true;
+        }
       }
-    }
-  }
-
-  // Attempt to match against byte/bit shifts.
-  if (AllowIntDomain &&
-      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
-       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
-       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
-    int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
-                                       Mask, 0, Zeroable, Subtarget);
-    if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
-                         32 <= ShuffleVT.getScalarSizeInBits())) {
-      PermuteImm = (unsigned)ShiftAmt;
-      return true;
-    }
-  }
 
-  // Attempt to match against bit rotates.
-  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
-      ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
-       Subtarget.hasAVX512())) {
-    int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
-                                            Subtarget, Mask);
-    if (0 < RotateAmt) {
-      Shuffle = X86ISD::VROTLI;
-      PermuteImm = (unsigned)RotateAmt;
-      return true;
+      // Attempt to match against bit rotates.
+      if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
+          ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
+           Subtarget.hasAVX512())) {
+        int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
+                                                Subtarget, Mask);
+        if (0 < RotateAmt) {
+          Shuffle = X86ISD::VROTLI;
+          PermuteImm = (unsigned)RotateAmt;
+          return true;
+        }
+      }
     }
   }
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -249,6 +249,7 @@
     return hasBWI() && canExtendTo512DQ();
   }
 
+  bool hasFasterShiftThanShuffle() const { return PreferLowerShuffleAsShift; }
   // If there are no 512-bit vectors and we prefer not to use 512-bit registers,
   // disable them in the legalizer.
   bool useAVX512Regs() const {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -88,6 +88,7 @@
       X86::TuningInsertVZEROUPPER,
       X86::TuningUseSLMArithCosts,
       X86::TuningUseGLMDivSqrtCosts,
+      X86::TuningPreferShiftShuffle,
 
       // Perf-tuning flags.
       X86::TuningFastGather,
diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
--- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
+++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -16,7 +16,7 @@
 ; SKX: # %bb.0:
 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SKX-NEXT: vpsrlq $32, %xmm0, %xmm1
 ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; SKX-NEXT: vmovd %xmm0, %eax
 ; SKX-NEXT: vzeroupper
@@ -43,7 +43,7 @@
 ; SKX: # %bb.0:
 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SKX-NEXT: vpsrlq $32, %xmm0, %xmm1
 ; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
 ; SKX-NEXT: vmovd %xmm0, %eax
 ; SKX-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -170,40 +170,40 @@
 }
 
 define dso_local i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="256" {
-; CHECK-LABEL: _Z9test_charPcS_i_256:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB8_1: # %vector.body
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
-; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
-; CHECK-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
-; CHECK-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
-; CHECK-NEXT: vpaddd %ymm2, %ymm3, %ymm2
-; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
-; CHECK-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
-; CHECK-NEXT: vpaddd %ymm1, %ymm3, %ymm1
-; CHECK-NEXT: addq $32, %rcx
-; CHECK-NEXT: cmpq %rcx, %rax
-; CHECK-NEXT: jne .LBB8_1
-; CHECK-NEXT: # %bb.2: # %middle.block
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm1
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX512-LABEL: _Z9test_charPcS_i_256:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: movl %edx, %eax
+; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: xorl %ecx, %ecx
+; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-AVX512-NEXT: .p2align 4, 0x90
+; CHECK-AVX512-NEXT: .LBB8_1: # %vector.body
+; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-AVX512-NEXT: vpmovsxbw 16(%rdi,%rcx), %ymm3
+; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm4
+; CHECK-AVX512-NEXT: vpmovsxbw 16(%rsi,%rcx), %ymm5
+; CHECK-AVX512-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
+; CHECK-AVX512-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
+; CHECK-AVX512-NEXT: vpmaddwd %ymm4, %ymm3, %ymm3
+; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm3, %ymm1
+; CHECK-AVX512-NEXT: addq $32, %rcx
+; CHECK-AVX512-NEXT: cmpq %rcx, %rax
+; CHECK-AVX512-NEXT: jne .LBB8_1
+; CHECK-AVX512-NEXT: # %bb.2: # %middle.block
+; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm1
+; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1
+; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vmovd %xmm0, %eax
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 entry:
   %3 = zext i32 %2 to i64
   br label %vector.body
@@ -241,35 +241,35 @@
 }
 
 define dso_local i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="512" {
-; CHECK-LABEL: _Z9test_charPcS_i_512:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB9_1: # %vector.body
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2
-; CHECK-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3
-; CHECK-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2
-; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
-; CHECK-NEXT: addq $32, %rcx
-; CHECK-NEXT: cmpq %rcx, %rax
-; CHECK-NEXT: jne .LBB9_1
-; CHECK-NEXT: # %bb.2: # %middle.block
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX512-LABEL: _Z9test_charPcS_i_512:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: movl %edx, %eax
+; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: xorl %ecx, %ecx
+; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512-NEXT: .p2align 4, 0x90
+; CHECK-AVX512-NEXT: .LBB9_1: # %vector.body
+; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %zmm2
+; CHECK-AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %zmm3
+; CHECK-AVX512-NEXT: vpmaddwd %zmm2, %zmm3, %zmm2
+; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; CHECK-AVX512-NEXT: addq $32, %rcx
+; CHECK-AVX512-NEXT: cmpq %rcx, %rax
+; CHECK-AVX512-NEXT: jne .LBB9_1
+; CHECK-AVX512-NEXT: # %bb.2: # %middle.block
+; CHECK-AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1
+; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vmovd %xmm0, %eax
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 entry:
   %3 = zext i32 %2 to i64
   br label %vector.body
@@ -310,30 +310,30 @@
 @b = dso_local global [1024 x i8] zeroinitializer, align 16
 
 define dso_local i32 @sad_16i8_256() "min-legal-vector-width"="256" {
-; CHECK-LABEL: sad_16i8_256:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00
-; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB10_1: # %vector.body
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm2
-; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
-; CHECK-NEXT: vpaddd %ymm1, %ymm2, %ymm1
-; CHECK-NEXT: addq $4, %rax
-; CHECK-NEXT: jne .LBB10_1
-; CHECK-NEXT: # %bb.2: # %middle.block
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX512-LABEL: sad_16i8_256:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
+; CHECK-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX512-NEXT: .p2align 4, 0x90
+; CHECK-AVX512-NEXT: .LBB10_1: # %vector.body
+; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-AVX512-NEXT: vmovdqu a+1024(%rax), %xmm2
+; CHECK-AVX512-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
+; CHECK-AVX512-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; CHECK-AVX512-NEXT: addq $4, %rax
+; CHECK-AVX512-NEXT: jne .LBB10_1
+; CHECK-AVX512-NEXT: # %bb.2: # %middle.block
+; CHECK-AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1
+; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vmovd %xmm0, %eax
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 entry:
   br label %vector.body
@@ -371,30 +371,30 @@
 }
 
 define dso_local i32 @sad_16i8_512() "min-legal-vector-width"="512" {
-; CHECK-LABEL: sad_16i8_512:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: movq $-1024, %rax # imm = 0xFC00
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: .LBB11_1: # %vector.body
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovdqu a+1024(%rax), %xmm1
-; CHECK-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: addq $4, %rax
-; CHECK-NEXT: jne .LBB11_1
-; CHECK-NEXT: # %bb.2: # %middle.block
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovd %xmm0, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX512-LABEL: sad_16i8_512:
+; CHECK-AVX512: # %bb.0: # %entry
+; CHECK-AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: movq $-1024, %rax # imm = 0xFC00
+; CHECK-AVX512-NEXT: .p2align 4, 0x90
+; CHECK-AVX512-NEXT: .LBB11_1: # %vector.body
+; CHECK-AVX512-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-AVX512-NEXT: vmovdqu a+1024(%rax), %xmm1
+; CHECK-AVX512-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
+; CHECK-AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-AVX512-NEXT: addq $4, %rax
+; CHECK-AVX512-NEXT: jne .LBB11_1
+; CHECK-AVX512-NEXT: # %bb.2: # %middle.block
+; CHECK-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpsrlq $32, %xmm0, %xmm1
+; CHECK-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vmovd %xmm0, %eax
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
 entry:
   br label %vector.body
@@ -952,44 +952,44 @@
 }
 
 define dso_local void @zext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: zext_v16i8_v16i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vmovdqa %ymm0, (%rdi)
-; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi)
-; CHECK-NEXT: vmovdqa %ymm3, 96(%rdi)
-; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX512-LABEL: zext_v16i8_v16i64:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-AVX512-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; CHECK-AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
+; CHECK-AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; CHECK-AVX512-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; CHECK-AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdi)
+; CHECK-AVX512-NEXT: vmovdqa %ymm1, 64(%rdi)
+; CHECK-AVX512-NEXT: vmovdqa %ymm3, 96(%rdi)
+; CHECK-AVX512-NEXT: vmovdqa %ymm2, 32(%rdi)
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
   %a = zext <16 x i8> %x to <16 x i64>
   store <16 x i64> %a, <16 x i64>* %y
   ret void
 }
 
 define dso_local void @sext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: sext_v16i8_v16i64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; CHECK-NEXT: vpmovsxwq %xmm2, %ymm2
-; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; CHECK-NEXT: vpmovsxwq %xmm3, %ymm3
-; CHECK-NEXT: vpmovsxwq %xmm1, %ymm1
-; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0
-; CHECK-NEXT: vmovdqa %ymm0, (%rdi)
-; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi)
-; CHECK-NEXT: vmovdqa %ymm3, 96(%rdi)
-; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-AVX512-LABEL: sext_v16i8_v16i64:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpmovsxbw %xmm0, %ymm1
+; CHECK-AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-AVX512-NEXT: vpmovsxwq %xmm2, %ymm2
+; CHECK-AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
+; CHECK-AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-AVX512-NEXT: vpmovsxwq %xmm3, %ymm3
+; CHECK-AVX512-NEXT: vpmovsxwq %xmm1, %ymm1
+; CHECK-AVX512-NEXT: vpmovsxbq %xmm0, %ymm0
+; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdi)
+; CHECK-AVX512-NEXT: vmovdqa %ymm1, 64(%rdi)
+; CHECK-AVX512-NEXT: vmovdqa %ymm3, 96(%rdi)
+; CHECK-AVX512-NEXT: vmovdqa %ymm2, 32(%rdi)
+; CHECK-AVX512-NEXT: vzeroupper
+; CHECK-AVX512-NEXT: retq
   %a = sext <16 x i8> %x to <16 x i64>
   store <16 x i64> %a, <16 x i64>* %y
   ret void
diff --git a/llvm/test/CodeGen/X86/pr57340.ll b/llvm/test/CodeGen/X86/pr57340.ll
--- a/llvm/test/CodeGen/X86/pr57340.ll
+++ b/llvm/test/CodeGen/X86/pr57340.ll
@@ -47,7 +47,7 @@
 ; CHECK-NEXT: movw $-5, %ax
 ; CHECK-NEXT: kmovd %eax, %k1
 ; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; CHECK-NEXT: vpsrlq $32, %xmm1, %xmm0
 ; CHECK-NEXT: vpextrw $0, %xmm0, %eax
 ; CHECK-NEXT: movzwl %ax, %eax
 ; CHECK-NEXT: vmovd %eax, %xmm0
@@ -88,7 +88,7 @@
 ; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: movw $-17, %ax
 ; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: vpextrw $0, %xmm5, %eax
 ; CHECK-NEXT: kandw %k1, %k0, %k0
 ; CHECK-NEXT: movzwl %ax, %eax
@@ -128,7 +128,7 @@
 ; CHECK-NEXT: movw $-65, %ax
 ; CHECK-NEXT: kmovd %eax, %k1
 ; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: vpextrw $0, %xmm7, %eax
 ; CHECK-NEXT: movzwl %ax, %eax
 ; CHECK-NEXT: vmovd %eax, %xmm7
@@ -200,7 +200,7 @@
 ; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: movw $-1025, %ax # imm = 0xFBFF
 ; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-NEXT: vpsrlq $32, %xmm1, %xmm2
 ; CHECK-NEXT: vpextrw $0, %xmm2, %eax
 ; CHECK-NEXT: movzwl %ax, %eax
 ; CHECK-NEXT: vmovd %eax, %xmm2
@@ -234,7 +234,7 @@
 ; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: movw $-4097, %ax # imm = 0xEFFF
 ; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: vpextrw $0, %xmm2, %eax
 ; CHECK-NEXT: kandw %k1, %k0, %k0
 ; CHECK-NEXT: movzwl %ax, %eax
@@ -269,7 +269,7 @@
 ; CHECK-NEXT: movw $-16385, %ax # imm = 0xBFFF
 ; CHECK-NEXT: kmovd %eax, %k1
 ; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: vpextrw $0, %xmm2, %eax
 ; CHECK-NEXT: movzwl %ax, %eax
 ; CHECK-NEXT: vmovd %eax, %xmm2