Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7823,12 +7823,10 @@
 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
 ///
-/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
+/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ
 /// byte-shift instructions. The mask must consist of a shifted sequential
 /// shuffle from one of the input vectors and zeroable elements for the
 /// remaining 'shifted in' elements.
-///
-/// Note that this only handles 128-bit vector widths currently.
 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
                                              SDValue V2, ArrayRef<int> Mask,
                                              SelectionDAG &DAG) {
@@ -7836,63 +7834,56 @@
 
   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
 
-  int Size = Mask.size();
-  int Scale = 16 / Size;
-
-  for (int Shift = 1; Shift < Size; Shift++) {
-    int ByteShift = Shift * Scale;
-
-    // PSRLDQ : (little-endian) right byte shift
-    // [  5, 6,  7, zz, zz, zz, zz, zz]
-    // [ -1, 5,  6,  7, zz, zz, zz, zz]
-    // [  1, 2, -1, -1, -1, -1, zz, zz]
-    bool ZeroableRight = true;
-    for (int i = Size - Shift; i < Size; i++) {
-      ZeroableRight &= Zeroable[i];
-    }
-
-    if (ZeroableRight) {
-      bool ValidShiftRight1 =
-          isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
-      bool ValidShiftRight2 =
-          isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
-
-      if (ValidShiftRight1 || ValidShiftRight2) {
-        // Cast the inputs to v2i64 to match PSRLDQ.
-        SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
-        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
-        SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
-                                      DAG.getConstant(ByteShift * 8, MVT::i8));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
-      }
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumLaneElts = NumElts / NumLanes;
+  int Scale = 16 / NumLaneElts;
+  MVT ShiftVT = MVT::getVectorVT(MVT::i64, 2 * NumLanes);
+
+  // PSLLDQ : (little-endian) left byte shift
+  // [ zz,  0,  1,  2,  3,  4,  5,  6]
+  // [ zz, zz, -1, -1,  2,  3,  4, -1]
+  // [ zz, zz, zz, zz, zz, zz, -1,  1]
+  // PSRLDQ : (little-endian) right byte shift
+  // [  5, 6,  7, zz, zz, zz, zz, zz]
+  // [ -1, 5,  6,  7, zz, zz, zz, zz]
+  // [  1, 2, -1, -1, -1, -1, zz, zz]
+  auto MatchByteShift = [&](int Shift) -> SDValue {
+    bool MatchLeft = true, MatchRight = true;
+    for (int l = 0; l < NumElts; l += NumLaneElts) {
+      for (int i = 0; i < Shift; ++i)
+        MatchLeft &= Zeroable[l + i];
+      for (int i = NumLaneElts - Shift; i < NumLaneElts; ++i)
+        MatchRight &= Zeroable[l + i];
     }
+    if (!(MatchLeft || MatchRight))
+      return SDValue();
 
-    // PSLLDQ : (little-endian) left byte shift
-    // [ zz,  0,  1,  2,  3,  4,  5,  6]
-    // [ zz, zz, -1, -1,  2,  3,  4, -1]
-    // [ zz, zz, zz, zz, zz, zz, -1,  1]
-    bool ZeroableLeft = true;
-    for (int i = 0; i < Shift; i++) {
-      ZeroableLeft &= Zeroable[i];
-    }
-
-    if (ZeroableLeft) {
-      bool ValidShiftLeft1 =
-          isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
-      bool ValidShiftLeft2 =
-          isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
-
-      if (ValidShiftLeft1 || ValidShiftLeft2) {
-        // Cast the inputs to v2i64 to match PSLLDQ.
-        SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
-        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
-        SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
-                                      DAG.getConstant(ByteShift * 8, MVT::i8));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
-      }
+    bool MatchV1 = true, MatchV2 = true;
+    for (int l = 0; l < NumElts; l += NumLaneElts) {
+      unsigned Pos = MatchLeft ? Shift + l : l;
+      unsigned Low = MatchLeft ? l : Shift + l;
+      unsigned Len = NumLaneElts - Shift;
+      MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
+      MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + NumElts);
     }
-  }
+    if (!(MatchV1 || MatchV2))
+      return SDValue();
+
+    int ByteShift = Shift * Scale;
+    unsigned Op = MatchRight ? X86ISD::VSRLDQ : X86ISD::VSHLDQ;
+    SDValue V = MatchV1 ? V1 : V2;
+    V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+    V = DAG.getNode(Op, DL, ShiftVT, V,
+                    DAG.getConstant(ByteShift * 8, MVT::i8));
+    return DAG.getNode(ISD::BITCAST, DL, VT, V);
+  };
+
+  for (int Shift = 1; Shift < NumLaneElts; ++Shift)
+    if (SDValue S = MatchByteShift(Shift))
+      return S;
+
+  // No match.
   return SDValue();
 }
 
@@ -10677,6 +10668,11 @@
     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v4i64, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle. However, if we have AVX2 and either inputs are already in place,
   // we will be able to shuffle even across lanes the other input in a single
@@ -10852,6 +10848,11 @@
           DL, MVT::v8i32, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i32, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -10940,6 +10941,11 @@
           DL, MVT::v16i16, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v16i16, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -11023,6 +11029,11 @@
           DL, MVT::v32i8, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v32i8, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -4296,6 +4296,12 @@
             (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
   def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
             (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
+
+  // Shift up / down and insert zeros.
+  def : Pat<(v4i64 (X86vshldq VR256:$src, (i8 imm:$amt))),
+            (VPSLLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
+  def : Pat<(v4i64 (X86vshrdq VR256:$src, (i8 imm:$amt))),
+            (VPSRLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
 }
 
 let Predicates = [UseSSE2] in {
Index: test/CodeGen/X86/vector-shuffle-256-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -1363,6 +1363,40 @@
   ret <16 x i16> %shuffle
 }
 
+define <16 x i16> @shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
+  ret <16 x i16> %shuffle
+}
+
 ;
 ; Shuffle to logical bit shifts
 ;
Index: test/CodeGen/X86/vector-shuffle-256-v32.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1655,6 +1655,40 @@
   ret <32 x i8> %shuffle
 }
 
+define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 48>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm1 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 63, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %shuffle
+}
+
 ;
 ; Shuffle to logical bit shifts
 ;
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -748,6 +748,38 @@
   ret <4 x i64> %shuffle
 }
 
+define <4 x i64> @shuffle_v4i64_z4z6(<4 x i64> %a) {
+; AVX1-LABEL: shuffle_v4i64_z4z6:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_z4z6:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 0, i32 4, i32 0, i32 6>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_5zuz(<4 x i64> %a) {
+; AVX1-LABEL: shuffle_v4i64_5zuz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_5zuz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 5, i32 0, i32 undef, i32 0>
+  ret <4 x i64> %shuffle
+}
+
 define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
 ; ALL-LABEL: stress_test1:
 ; ALL:       retq
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1822,6 +1822,38 @@
   ret <8 x i32> %shuffle
 }
 
+define <8 x i32> @shuffle_v8i32_zuu8zuuc(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_zuu8zuuc:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_zuu8zuuc:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 8, i32 0, i32 undef, i32 undef, i32 12>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_9ubzdefz(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_9ubzdefz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_9ubzdefz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,ymm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 9, i32 undef, i32 11, i32 0, i32 13, i32 14, i32 15, i32 0>
+  ret <8 x i32> %shuffle
+}
+
 define <8 x float> @splat_mem_v8f32_2(float* %p) {
 ; ALL-LABEL: splat_mem_v8f32_2:
 ; ALL:       # BB#0:
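
A note for readers, separate from the patch itself: the new MatchByteShift lambda checks each 128-bit lane independently because VPSLLDQ/VPSRLDQ on a YMM register shift the two lanes separately rather than the whole 256-bit value. The standalone C++ sketch below mirrors that per-lane matching so it can be compiled and stepped through outside of LLVM. It is illustrative only: matchByteShift, the plain std::vector types, and the example mask are not LLVM APIs, and for brevity it matches only against the first input vector, whereas the patch also tries the second operand by offsetting the expected indices by NumElts.

    // Standalone sketch of the per-lane byte-shift matching (illustrative
    // only, not LLVM code). A mask element of -1 means "undef"; Zeroable
    // marks elements known to be zero (LLVM derives this with
    // computeZeroableShuffleElements).
    #include <cstdio>
    #include <vector>

    // Returns true if shifting each 128-bit lane by 'Shift' elements (left
    // or right) reproduces the shuffle mask, shifting in zeros. Only the
    // first input vector (indices 0..NumElts-1) is considered here.
    static bool matchByteShift(const std::vector<int> &Mask,
                               const std::vector<bool> &Zeroable,
                               int NumLaneElts, int Shift) {
      int NumElts = static_cast<int>(Mask.size());
      bool MatchLeft = true, MatchRight = true;
      for (int l = 0; l < NumElts; l += NumLaneElts) {
        for (int i = 0; i < Shift; ++i)
          MatchLeft = MatchLeft && Zeroable[l + i];   // zeros enter low end
        for (int i = NumLaneElts - Shift; i < NumLaneElts; ++i)
          MatchRight = MatchRight && Zeroable[l + i]; // zeros enter high end
      }
      if (!MatchLeft && !MatchRight)
        return false;
      // Every lane must hold the same shifted sequential run of elements.
      for (int l = 0; l < NumElts; l += NumLaneElts) {
        int Pos = MatchLeft ? l + Shift : l; // where the run lands
        int Low = MatchLeft ? l : l + Shift; // first source element of run
        int Len = NumLaneElts - Shift;
        for (int i = 0; i < Len; ++i)
          if (Mask[Pos + i] != -1 && Mask[Pos + i] != Low + i)
            return false;
      }
      return true;
    }

    int main() {
      // v8i32 example [zz, 0, 1, 2, zz, 4, 5, 6]: a left shift by one i32
      // per lane. Scale = 16 / NumLaneElts = 4 bytes per element, so this
      // becomes a single VPSLLDQ $4 on the whole YMM register.
      std::vector<int> Mask = {-1, 0, 1, 2, -1, 4, 5, 6};
      std::vector<bool> Zeroable = {true, false, false, false,
                                    true, false, false, false};
      const int NumLaneElts = 4, Scale = 16 / NumLaneElts;
      for (int Shift = 1; Shift < NumLaneElts; ++Shift)
        if (matchByteShift(Mask, Zeroable, NumLaneElts, Shift))
          std::printf("element shift %d -> byte shift %d\n", Shift,
                      Shift * Scale);
      return 0;
    }

Built with any C++11 compiler (for example, c++ -std=c++11 sketch.cpp), this prints "element shift 1 -> byte shift 4". The same arithmetic is visible in the new tests: shuffle_v8i32_zuu8zuuc shifts by three i32 elements, i.e. VPSLLDQ $12, which matches the twelve leading "zero" bytes per lane in its AVX2 CHECK line.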