diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12120,23 +12120,32 @@ /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and -/// blends. -static SDValue lowerShuffleAsDecomposedShuffleBlend( +/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend. +static SDValue lowerShuffleAsDecomposedShuffleMerge( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + int NumElts = Mask.size(); + int NumLanes = VT.getSizeInBits() / 128; + int NumEltsPerLane = NumElts / NumLanes; + // Shuffle the input elements into the desired positions in V1 and V2 and - // blend them together. - SmallVector V1Mask(Mask.size(), -1); - SmallVector V2Mask(Mask.size(), -1); - SmallVector BlendMask(Mask.size(), -1); - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] >= 0 && Mask[i] < Size) { - V1Mask[i] = Mask[i]; - BlendMask[i] = i; - } else if (Mask[i] >= Size) { - V2Mask[i] = Mask[i] - Size; - BlendMask[i] = i + Size; + // unpack/blend them together. + bool IsAlternating = true; + SmallVector V1Mask(NumElts, -1); + SmallVector V2Mask(NumElts, -1); + SmallVector FinalMask(NumElts, -1); + for (int i = 0; i < NumElts; ++i) { + int M = Mask[i]; + if (M >= 0 && M < NumElts) { + V1Mask[i] = M; + FinalMask[i] = i; + IsAlternating &= (i & 1) == 0; + } else if (M >= NumElts) { + V2Mask[i] = M - NumElts; + FinalMask[i] = i + NumElts; + IsAlternating &= (i & 1) == 1; } + } // Try to lower with the simpler initial blend/unpack/rotate strategies unless // one of the input shuffles would be a no-op. We prefer to shuffle inputs as @@ -12160,9 +12169,30 @@ return BlendPerm; } + // If the final mask is an alternating blend of vXi8/vXi16, convert to an + // UNPCKL(SHUFFLE, SHUFFLE) pattern. + // TODO: It doesn't have to be alternating - but each lane mustn't have more + // than half the elements coming from each source. + if (IsAlternating && VT.getScalarSizeInBits() < 32) { + V1Mask.assign(NumElts, -1); + V2Mask.assign(NumElts, -1); + FinalMask.assign(NumElts, -1); + for (int i = 0; i != NumElts; i += NumEltsPerLane) + for (int j = 0; j != NumEltsPerLane; ++j) { + int M = Mask[i + j]; + if (M >= 0 && M < NumElts) { + V1Mask[i + (j / 2)] = M; + FinalMask[i + j] = i + (j / 2); + } else if (M >= NumElts) { + V2Mask[i + (j / 2)] = M - NumElts; + FinalMask[i + j] = i + (j / 2) + NumElts; + } + } + } + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); - return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); + return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask); } /// Try to lower a vector shuffle as a bit rotation. @@ -13901,7 +13931,7 @@ // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG); // We implement this with SHUFPD which is pretty lame because it will likely @@ -14193,7 +14223,7 @@ // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG); // Try to lower by permuting the inputs into an unpack instruction. @@ -14943,8 +14973,8 @@ } // We can always bit-blend if we have to so the fallback strategy is to - // decompose into single-input permutes and blends. - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, + // decompose into single-input permutes and blends/unpacks. + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG); } @@ -15281,9 +15311,9 @@ return Result; } - // Handle multi-input cases by blending single-input shuffles. + // Handle multi-input cases by blending/unpacking single-input shuffles. if (NumV2Elements > 0) - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG); // The fallback path for single-input shuffles widens this into two v8i16 @@ -15463,7 +15493,7 @@ } /// Either split a vector in halves or decompose the shuffles and the -/// blend. +/// blend/unpack. /// /// This is provided as a good fallback for many lowerings of non-single-input /// shuffles with more than one 128-bit lane. In those cases, we want to select @@ -15498,8 +15528,8 @@ return true; }; if (DoBothBroadcast()) - return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, - Subtarget, DAG); + return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget, + DAG); // If the inputs all stem from a single 128-bit lane of each input, then we // split them rather than blending because the split will decompose to @@ -15515,9 +15545,9 @@ if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); - // Otherwise, just fall back to decomposed shuffles and a blend. This requires - // that the decomposed single-input shuffles don't end up here. - return lowerShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget, + // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This + // requires that the decomposed single-input shuffles don't end up here. + return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget, DAG); } @@ -16569,7 +16599,7 @@ // If we have one input in place, then we can permute the other input and // blend the result. if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -16597,7 +16627,7 @@ // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. if (Subtarget.hasAVX2()) - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); // Otherwise fall back on generic lowering. @@ -16679,7 +16709,7 @@ // If we have one input in place, then we can permute the other input and // blend the result. if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)) - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the @@ -16699,7 +16729,7 @@ return Result; // Otherwise fall back on generic blend lowering. - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); } @@ -16794,7 +16824,7 @@ // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget.hasAVX2()) - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG); // Otherwise fall back on generic lowering. @@ -16913,7 +16943,7 @@ return Result; // Otherwise fall back on generic blend lowering. - return lowerShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask, + return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG); } diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -289,31 +289,13 @@ } define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) { -; SSE2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: -; SSSE3: # %bb.0: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: -; SSE41: # %bb.0: -; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: +; SSE: # %bb.0: +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -2139,9 +2139,9 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,3] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31: @@ -2161,9 +2161,9 @@ ; ; XOPAVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,3] +; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -2181,9 +2181,9 @@ ; ; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,3] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27: @@ -2203,9 +2203,9 @@ ; ; XOPAVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,3] +; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -5086,10 +5086,9 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27: @@ -5110,10 +5109,9 @@ ; ; XOPAVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -5181,10 +5179,10 @@ ; ; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31: @@ -5205,10 +5203,10 @@ ; ; XOPAVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,3] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -5283,21 +5281,19 @@ ; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7],ymm1[8,9,10,11,12,13,14],ymm2[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,7,u,4,7,u,u> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,10,11,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,28,29,u,u,30,31,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,10,11,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: @@ -5320,12 +5316,10 @@ ; XOPAVX2-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3] -; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,6,6,6,6,8,9,10,11,14,14,14,14] -; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] -; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7],ymm1[8,9,10,11,12,13,14],ymm2[15] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,3,2,3,4,7,6,7] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -5350,19 +5344,18 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,0,1,u,u,2,3,u,u,24,25,u,u,26,27,u,u,16,17,u,u,18,19] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,0,4,u,6,4,u,u> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,10,11,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,28,29,u,u,30,31,u,u] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,10,11,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: @@ -5386,10 +5379,9 @@ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,2,3] ; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,0,1,u,u,2,3,u,u,24,25,u,u,26,27,u,u,16,17,u,u,18,19] -; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] +; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -5469,10 +5461,9 @@ ; ; AVX2-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: @@ -5494,10 +5485,9 @@ ; ; XOPAVX2-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -5516,10 +5506,10 @@ ; ; AVX2-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: ; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,3,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: @@ -5541,10 +5531,10 @@ ; ; XOPAVX2-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,3,3] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,u,u,2,3,u,u,4,5,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23] -; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] +; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2793,16 +2793,16 @@ ; ; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,3] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX2-NEXT: retq ; ; AVX512VLBW-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31] -; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,3] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,3] +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: @@ -2822,9 +2822,9 @@ ; ; XOPAVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,3] +; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -2842,16 +2842,16 @@ ; ; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,3] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX2-NEXT: retq ; ; AVX512VLBW-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero -; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23] -; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,3] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,3] +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: @@ -2871,9 +2871,9 @@ ; ; XOPAVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23] -; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero -; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,3] +; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -3316,7 +3316,6 @@ ; AVX512VLBW-FAST-NEXT: kmovd %eax, %k1 ; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,0,1,u,u,u,u,5,10,13,u,u,0,u,u,16,23,u,23,u,u,u,u,u,u,u,27,u,u,u,u] ; AVX512VLBW-FAST-NEXT: retq - ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX512VLVBMI: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -67,16 +67,16 @@ ; KNL: ## %bb.0: ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,1,2,3] -; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,6,7,u,u,12,13,u,u,2,3,u,u,0,1,u,u,22,23,u,u,20,21,u,u,18,19,u,u,u,u] +; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[6,7,12,13,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,u,u,u,u,u,u,u,u,u,u] ; KNL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3] -; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,u,u,4,5,u,u,2,3,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17,u,u] -; KNL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7],ymm0[8],ymm3[9],ymm0[10],ymm3[11],ymm0[12],ymm3[13],ymm0[14],ymm3[15] +; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,8,9,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u] +; KNL-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] ; KNL-NEXT: vextracti32x4 $3, %zmm1, %xmm1 ; KNL-NEXT: vpbroadcastw %xmm1, %ymm1 ; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7],ymm3[8,9,10,11,12,13,14],ymm1[15] ; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,6,7,u,u,12,13,u,u,2,3,u,u,0,1,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17] -; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,12,13,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u] +; KNL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] ; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ;