Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7788,34 +7788,12 @@
   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
 }
 
-/// \brief Try to lower a vector shuffle as a byte rotation.
-///
-/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
-/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
-/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
-/// try to generically lower a vector shuffle through such an pattern. It
-/// does not check for the profitability of lowering either as PALIGNR or
-/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
-/// This matches shuffle vectors that look like:
-///
-///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
+/// \brief Try to lower a vector shuffle as a rotation.
 ///
-/// Essentially it concatenates V1 and V2, shifts right by some number of
-/// elements, and takes the low elements as the result. Note that while this is
-/// specified as a *right shift* because x86 is little-endian, it is a *left
-/// rotate* of the vector lanes.
-static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
-                                          ArrayRef<int> Mask) {
-  // Don't accept any shuffles with zero elements.
-  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
-    return -1;
-
-  // PALIGNR works on 128-bit lanes.
-  SmallVector<int, 16> RepeatedMask;
-  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
-    return -1;
-
-  int NumElts = RepeatedMask.size();
+/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
+static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
+                                      ArrayRef<int> Mask) {
+  int NumElts = Mask.size();
 
   // We need to detect various ways of spelling a rotation:
   //   [11, 12, 13, 14, 15,  0,  1,  2]
@@ -7827,7 +7805,7 @@
   int Rotation = 0;
   SDValue Lo, Hi;
   for (int i = 0; i < NumElts; ++i) {
-    int M = RepeatedMask[i];
+    int M = Mask[i];
     assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
            "Unexpected mask index.");
     if (M < 0)
@@ -7879,8 +7857,43 @@
   V1 = Lo;
   V2 = Hi;
 
+  return Rotation;
+}
+
+/// \brief Try to lower a vector shuffle as a byte rotation.
+///
+/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
+/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
+/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
+/// try to generically lower a vector shuffle through such a pattern. It
+/// does not check for the profitability of lowering either as PALIGNR or
+/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
+/// This matches shuffle vectors that look like:
+///
+///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
+static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
+                                          ArrayRef<int> Mask) {
+  // Don't accept any shuffles with zero elements.
+  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
+    return -1;
+
+  // PALIGNR works on 128-bit lanes.
+  SmallVector<int, 16> RepeatedMask;
+  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
+    return -1;
+
+  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
+  if (Rotation <= 0)
+    return -1;
+
   // PALIGNR rotates bytes, so we need to scale the
   // rotation based on how many bytes are in the vector lane.
+  int NumElts = RepeatedMask.size();
   int Scale = 16 / NumElts;
   return Rotation * Scale;
 }
@@ -7931,6 +7944,37 @@
                         DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
 }
 
+/// \brief Try to lower a vector shuffle as a dword/qword rotation.
+///
+/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
+/// rotation of the concatenation of two vectors; this routine will
+/// try to generically lower a vector shuffle through such a pattern.
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
+static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
+                                          SDValue V1, SDValue V2,
+                                          ArrayRef<int> Mask,
+                                          const X86Subtarget &Subtarget,
+                                          SelectionDAG &DAG) {
+  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
+         "Only 32-bit and 64-bit elements are supported!");
+
+  // 128/256-bit vectors are only supported with VLX.
+  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
+         && "VLX required for 128/256-bit vectors");
+
+  SDValue Lo = V1, Hi = V2;
+  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
+  if (Rotation <= 0)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
+                     DAG.getConstant(Rotation, DL, MVT::i8));
+}
+
 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
 ///
 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
@@ -11505,6 +11549,13 @@
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
+  // If we have VLX support, we can use VALIGN.
+  if (Subtarget.hasVLX())
+    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
+                                                    Mask, Subtarget, DAG))
+      return Rotate;
+
+  // Try to use PALIGNR.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
                                                       Mask, Subtarget, DAG))
     return Rotate;
@@ -11666,6 +11717,12 @@
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
+  // If we have VLX support, we can use VALIGN.
+  if (Subtarget.hasVLX())
+    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
+                                                    Mask, Subtarget, DAG))
+      return Rotate;
+
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
@@ -12094,6 +12151,12 @@
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
+  // Try to use VALIGN.
+  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
+                                                  Mask, Subtarget, DAG))
+    return Rotate;
+
+  // Try to use PALIGNR.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
                                                       Mask, Subtarget, DAG))
     return Rotate;
@@ -12143,6 +12206,11 @@
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
+  // Try to use VALIGN.
+  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
+                                                  Mask, Subtarget, DAG))
+    return Rotate;
+
   // Try to use byte rotation instructions.
   if (Subtarget.hasBWI())
     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1462,8 +1462,7 @@
 ;
 ; AVX512VL-LABEL: shuffle_v4i64_1234:
 ; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
+; AVX512VL-NEXT:    valignq {{.*#+}} ymm0 = ymm0[1,2,3],ymm1[0]
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
   ret <4 x i64> %shuffle
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2562,8 +2562,7 @@
 ;
 ; AVX512VL-LABEL: shuffle_v8i32_12345678:
 ; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vmovdqa32 {{.*#+}} ymm2 = [1,2,3,4,5,6,7,8]
-; AVX512VL-NEXT:    vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT:    valignd {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7],ymm1[0]
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   ret <8 x i32> %shuffle
@@ -2585,8 +2584,7 @@
 ;
 ; AVX512VL-LABEL: shuffle_v8i32_12345670:
 ; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vmovdqa32 {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0]
-; AVX512VL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    valignd {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,0]
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
   ret <8 x i32> %shuffle
Index: test/CodeGen/X86/vector-shuffle-512-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -342,8 +342,7 @@
 define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b) {
 ; ALL-LABEL: shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]
-; ALL-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
   ret <16 x i32> %shuffle
@@ -352,8 +351,7 @@
 define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i32> %a) {
 ; ALL-LABEL: shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
-; ALL-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; ALL-NEXT:    valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
   ret <16 x i32> %shuffle
Index: test/CodeGen/X86/vector-shuffle-512-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2280,14 +2280,12 @@
 ;
 ; AVX512F-LABEL: shuffle_v8i64_12345678:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8]
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm1[0]
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_12345678:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,5,0,6,0,7,0,8,0]
-; AVX512F-32-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-32-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm1[0]
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
   ret <8 x i64> %shuffle
@@ -2297,14 +2295,12 @@
 ;
 ; AVX512F-LABEL: shuffle_v8i64_12345670:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,2,3,4,5,6,7,0]
-; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0]
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_12345670:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0]
-; AVX512F-32-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0]
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
   ret <8 x i64> %shuffle
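
For reference, here is the pattern the new lowering catches, worked through on the first updated test above: shuffle_v4i64_1234 uses the mask [1, 2, 3, 4], i.e. elements 1-3 of %a followed by element 0 of %b, which is the concatenation of the two sources rotated by one 64-bit element; matchVectorShuffleAsRotate returns 1 and the shuffle becomes a single VALIGNQ with immediate 1. A minimal standalone reproducer is sketched below; the file layout, function name, and RUN line are illustrative only and not part of this patch, and the CHECK line simply mirrors the updated AVX512VL output shown above.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s

; A two-input shuffle whose mask [1,2,3,4] is a rotation of the concatenation
; of %a and %b by one qword; with this patch it should lower to one valignq.
define <4 x i64> @rotate_v4i64_by_one(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: rotate_v4i64_by_one:
; CHECK: valignq {{.*#+}} ymm0 = ymm0[1,2,3],ymm1[0]
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i64> %shuffle
}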