Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7787,34 +7787,12 @@
   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
 }
 
-/// \brief Try to lower a vector shuffle as a byte rotation.
-///
-/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
-/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
-/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
-/// try to generically lower a vector shuffle through such an pattern. It
-/// does not check for the profitability of lowering either as PALIGNR or
-/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
-/// This matches shuffle vectors that look like:
-///
-///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
+/// \brief Try to lower a vector shuffle as a rotation.
 ///
-/// Essentially it concatenates V1 and V2, shifts right by some number of
-/// elements, and takes the low elements as the result. Note that while this is
-/// specified as a *right shift* because x86 is little-endian, it is a *left
-/// rotate* of the vector lanes.
-static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
-                                          ArrayRef<int> Mask) {
-  // Don't accept any shuffles with zero elements.
-  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
-    return -1;
-
-  // PALIGNR works on 128-bit lanes.
-  SmallVector<int, 16> RepeatedMask;
-  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
-    return -1;
-
-  int NumElts = RepeatedMask.size();
+/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
+static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
+                                      ArrayRef<int> Mask) {
+  int NumElts = Mask.size();
 
   // We need to detect various ways of spelling a rotation:
   //   [11, 12, 13, 14, 15,  0,  1,  2]
@@ -7826,7 +7804,7 @@
   int Rotation = 0;
   SDValue Lo, Hi;
   for (int i = 0; i < NumElts; ++i) {
-    int M = RepeatedMask[i];
+    int M = Mask[i];
     assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
            "Unexpected mask index.");
     if (M < 0)
@@ -7878,8 +7856,43 @@
   V1 = Lo;
   V2 = Hi;
 
+  return Rotation;
+}
+
+/// \brief Try to lower a vector shuffle as a byte rotation.
+///
+/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
+/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
+/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
+/// try to generically lower a vector shuffle through such a pattern. It
+/// does not check for the profitability of lowering either as PALIGNR or
+/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
+/// This matches shuffle vectors that look like:
+///
+///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
+static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
+                                          ArrayRef<int> Mask) {
+  // Don't accept any shuffles with zero elements.
+  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
+    return -1;
+
+  // PALIGNR works on 128-bit lanes.
+  SmallVector<int, 16> RepeatedMask;
+  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
+    return -1;
+
+  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
+  if (Rotation <= 0)
+    return -1;
+
   // PALIGNR rotates bytes, so we need to scale the
   // rotation based on how many bytes are in the vector lane.
+  int NumElts = RepeatedMask.size();
   int Scale = 16 / NumElts;
   return Rotation * Scale;
 }
@@ -7930,6 +7943,37 @@
                       DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
 }
 
+/// \brief Try to lower a vector shuffle as a dword/qword rotation.
+///
+/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
+/// rotation of the concatenation of two vectors; this routine will
+/// try to generically lower a vector shuffle through such a pattern.
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
+static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
+                                          SDValue V1, SDValue V2,
+                                          ArrayRef<int> Mask,
+                                          const X86Subtarget &Subtarget,
+                                          SelectionDAG &DAG) {
+  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
+         "Only 32-bit and 64-bit elements are supported!");
+
+  // 128/256-bit vectors are only supported with VLX.
+  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
+         && "VLX required for 128/256-bit vectors");
+
+  SDValue Lo = V1, Hi = V2;
+  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
+  if (Rotation <= 0)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
+                     DAG.getConstant(Rotation, DL, MVT::i8));
+}
+
 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
 ///
 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
@@ -11504,6 +11548,13 @@
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
+  // If we have VLX support, we can use VALIGN.
+  if (Subtarget.hasVLX())
+    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
+                                                    Mask, Subtarget, DAG))
+      return Rotate;
+
+  // Try to use PALIGNR.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
                                                       Mask, Subtarget, DAG))
     return Rotate;
@@ -11665,6 +11716,12 @@
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
+  // If we have VLX support, we can use VALIGN.
+  if (Subtarget.hasVLX())
+    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
+                                                    Mask, Subtarget, DAG))
+      return Rotate;
+
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
@@ -12093,6 +12150,12 @@
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
+  // Try to use VALIGN.
+  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
+                                                  Mask, Subtarget, DAG))
+    return Rotate;
+
+  // Try to use PALIGNR.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
                                                       Mask, Subtarget, DAG))
     return Rotate;
@@ -12142,6 +12205,11 @@
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
+  // Try to use VALIGN.
+  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
+                                                  Mask, Subtarget, DAG))
+    return Rotate;
+
   // Try to use byte rotation instructions.
   if (Subtarget.hasBWI())
     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1445,3 +1445,45 @@
   %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <4 x i32> zeroinitializer
   ret <4 x i64> %tmp2
 }
+
+define <4 x i64> @shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_1234:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_1234:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v4i64_1234:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    valignq {{.*#+}} ymm0 = ymm0[1,2,3],ymm1[0]
+; AVX512VL-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_1230(<4 x i64> %a) {
+; AVX1-LABEL: shuffle_v4i64_1230:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_1230:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v4i64_1230:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
+; AVX512VL-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+  ret <4 x i64> %shuffle
+}
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2543,3 +2543,49 @@
   %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <8 x i32> zeroinitializer
   ret <8 x i32> %tmp2
 }
+
+define <8 x i32> @shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_12345678:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[3,0],ymm1[4,4],ymm0[7,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_12345678:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_12345678:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    valignd {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7],ymm1[0]
+; AVX512VL-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_12345670(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_12345670:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[3,0],ymm1[4,4],ymm0[7,4]
+; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_12345670:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0]
+; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_12345670:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    valignd {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,0]
+; AVX512VL-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
+  ret <8 x i32> %shuffle
+}
Index: test/CodeGen/X86/vector-shuffle-512-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -338,3 +338,21 @@
   %shuffle = shufflevector <16 x i32> zeroinitializer, <16 x i32> %a, <16 x i32>
   ret <16 x i32> %shuffle
 }
+
+define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+; ALL:       # BB#0:
+; ALL-NEXT:    valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+  ret <16 x i32> %shuffle
+}
+
+define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i32> %a) {
+; ALL-LABEL: shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
+; ALL:       # BB#0:
+; ALL-NEXT:    valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
+; ALL-NEXT:    retq
+  %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
+  ret <16 x i32> %shuffle
+}
Index: test/CodeGen/X86/vector-shuffle-512-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2275,3 +2275,33 @@
   %shuffle = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <8 x i32>
   ret <8 x double> %shuffle
 }
+
+define <8 x i64> @shuffle_v8i64_12345678(<8 x i64> %a, <8 x i64> %b) {
+;
+; AVX512F-LABEL: shuffle_v8i64_12345678:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT:    retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_12345678:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-32-NEXT:    retl
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_12345670(<8 x i64> %a) {
+;
+; AVX512F-LABEL: shuffle_v8i64_12345670:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0]
+; AVX512F-NEXT:    retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_12345670:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0]
+; AVX512F-32-NEXT:    retl
+  %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
+  ret <8 x i64> %shuffle
+}
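
For reference, the rotation-matching rule that the refactored matchVectorShuffleAsRotate applies can be illustrated with a small standalone sketch. This is a simplification and not the patch's code: sources are modeled as the integers 0 (V1) and 1 (V2) instead of SDValues, and the zeroable-element and profitability checks handled elsewhere in the lowering are omitted. It prints the element rotation for the v8i16 example from the doc comment and for the shuffle_v8i32_12345678 test case added above, which VALIGND handles with a rotation of 1 when VLX is available.

// Standalone sketch (simplified, not LLVM code) of the generic rotation
// matcher. Mask values in [0, N) pick from V1, values in [N, 2N) pick from
// V2, and -1 marks an undef element.
#include <cassert>
#include <cstdio>
#include <vector>

// Returns the rotation amount in elements, or -1 if Mask is not a rotation of
// the concatenation of the two sources. On success, Lo and Hi report which
// source (0 or 1) should become the first and second rotate operand, mirroring
// how the patch rewrites V1/V2 before emitting PALIGNR or VALIGN.
static int matchShuffleAsRotate(const std::vector<int> &Mask, int &Lo, int &Hi) {
  int NumElts = (int)Mask.size();
  int Rotation = 0;
  Lo = Hi = -1;

  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // Undef elements are compatible with any rotation.

    // Where would a rotated source have to start for element i to read M?
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      return -1; // The identity rotation is not interesting.

    // A negative start means we matched the tail of a source, a positive one
    // means we matched its head; either way it implies a candidate rotation.
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
      return -1; // Inconsistent with previously matched elements.

    // Every element of the low (or high) part of the result must read the
    // same source.
    int Src = M < NumElts ? 0 : 1;
    int &Target = StartIdx < 0 ? Hi : Lo;
    if (Target < 0)
      Target = Src;
    else if (Target != Src)
      return -1;
  }

  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  if (Lo < 0) Lo = Hi;
  if (Hi < 0) Hi = Lo;
  return Rotation;
}

int main() {
  int Lo, Hi;
  // The v8i16 example from the comment: a rotation by 3 elements.
  std::vector<int> M1 = {11, 12, 13, 14, 15, 0, 1, 2};
  printf("rotation %d (Lo=V%d, Hi=V%d)\n", matchShuffleAsRotate(M1, Lo, Hi),
         Lo + 1, Hi + 1);
  // The shuffle_v8i32_12345678 test case: a rotation by 1 element with the
  // operands swapped, which the patch lowers to VALIGND under VLX.
  std::vector<int> M2 = {1, 2, 3, 4, 5, 6, 7, 8};
  printf("rotation %d (Lo=V%d, Hi=V%d)\n", matchShuffleAsRotate(M2, Lo, Hi),
         Lo + 1, Hi + 1);
  return 0;
}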