Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7791,63 +7791,58 @@
   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
 
-  int Size = Mask.size();
-  int Scale = 16 / Size;
-
-  for (int Shift = 1; Shift < Size; Shift++) {
-    int ByteShift = Shift * Scale;
-
-    // PSRLDQ : (little-endian) right byte shift
-    // [  5,  6,  7, zz, zz, zz, zz, zz]
-    // [ -1,  5,  6,  7, zz, zz, zz, zz]
-    // [  1,  2, -1, -1, -1, -1, zz, zz]
-    bool ZeroableRight = true;
-    for (int i = Size - Shift; i < Size; i++) {
-      ZeroableRight &= Zeroable[i];
-    }
-
-    if (ZeroableRight) {
-      bool ValidShiftRight1 =
-          isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
-      bool ValidShiftRight2 =
-          isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
-
-      if (ValidShiftRight1 || ValidShiftRight2) {
-        // Cast the inputs to v2i64 to match PSRLDQ.
-        SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
-        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
-        SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
-                                      DAG.getConstant(ByteShift * 8, MVT::i8));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
-      }
-    }
-
-    // PSLLDQ : (little-endian) left byte shift
-    // [ zz,  0,  1,  2,  3,  4,  5,  6]
-    // [ zz, zz, -1, -1,  2,  3,  4, -1]
-    // [ zz, zz, zz, zz, zz, zz, -1,  1]
-    bool ZeroableLeft = true;
-    for (int i = 0; i < Shift; i++) {
-      ZeroableLeft &= Zeroable[i];
-    }
-
-    if (ZeroableLeft) {
-      bool ValidShiftLeft1 =
-          isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
-      bool ValidShiftLeft2 =
-          isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
-
-      if (ValidShiftLeft1 || ValidShiftLeft2) {
-        // Cast the inputs to v2i64 to match PSLLDQ.
-        SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
-        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
-        SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
-                                      DAG.getConstant(ByteShift * 8, MVT::i8));
-        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
-      }
-    }
-  }
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumLaneElts = NumElts / NumLanes;
+  int Scale = 16 / NumLaneElts;
+  MVT ShiftVT = MVT::getVectorVT(MVT::i64, 2 * NumLanes);
+
+  // PSLLDQ : (little-endian) left byte shift
+  // [ zz,  0,  1,  2,  3,  4,  5,  6]
+  // [ zz, zz, -1, -1,  2,  3,  4, -1]
+  // [ zz, zz, zz, zz, zz, zz, -1,  1]
+  // PSRLDQ : (little-endian) right byte shift
+  // [  5,  6,  7, zz, zz, zz, zz, zz]
+  // [ -1,  5,  6,  7, zz, zz, zz, zz]
+  // [  1,  2, -1, -1, -1, -1, zz, zz]
+  auto MatchByteShift = [&](int Shift) -> SDValue {
+    bool MatchLeft = true, MatchRight = true;
+    for (int l = 0; l < NumElts; l += NumLaneElts) {
+      for (int i = 0; i < Shift; i++)
+        MatchLeft &= Zeroable[l + i];
+      for (int i = NumLaneElts - Shift; i < NumLaneElts; i++)
+        MatchRight &= Zeroable[l + i];
+    }
+    if (!(MatchLeft || MatchRight))
+      return SDValue();
+
+    bool MatchV1 = true, MatchV2 = true;
+    for (int l = 0; l < NumElts; l += NumLaneElts) {
+      unsigned Pos = MatchLeft ? Shift + l : l;
+      unsigned Low = MatchLeft ? l : Shift + l;
+      unsigned Len = NumLaneElts - Shift;
+      MatchV1 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low);
+      MatchV2 &= isSequentialOrUndefInRange(Mask, Pos, Len, Low + NumElts);
+    }
+    if (!(MatchV1 || MatchV2))
+      return SDValue();
+
+    int ByteShift = Shift * Scale;
+    unsigned Op = MatchRight ? X86ISD::VSRLDQ : X86ISD::VSHLDQ;
+    SDValue V = MatchV1 ? V1 : V2;
+    V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
+    V = DAG.getNode(Op, DL, ShiftVT, V,
+                    DAG.getConstant(ByteShift * 8, MVT::i8));
+    return DAG.getNode(ISD::BITCAST, DL, VT, V);
+  };
+
+  for (int Shift = 1; Shift < NumLaneElts; Shift++)
+    if (SDValue S = MatchByteShift(Shift))
+      return S;
+
+  // no match
   return SDValue();
 }
@@ -10576,6 +10571,11 @@
     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v4i64, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle. However, if we have AVX2 and either inputs are already in place,
   // we will be able to shuffle even across lanes the other input in a single
@@ -10751,6 +10751,11 @@
           DL, MVT::v8i32, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i32, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -10839,6 +10844,11 @@
           DL, MVT::v16i16, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v16i16, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
@@ -10922,6 +10932,11 @@
           DL, MVT::v32i8, V1, V2, Mask, DAG))
     return Shift;
 
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v32i8, V1, V2, Mask, DAG))
+    return Shift;
+
   // Try to simplify this by merging 128-bit lanes to enable a lane-based
   // shuffle.
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -4497,6 +4497,12 @@
     (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
   def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
             (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
+
+  // Shift up / down and insert zeros.
+  def : Pat<(v4i64 (X86vshldq VR256:$src, (i8 imm:$amt))),
+            (VPSLLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
+  def : Pat<(v4i64 (X86vshrdq VR256:$src, (i8 imm:$amt))),
+            (VPSRLDQYri VR256:$src, (BYTE_imm imm:$amt))>;
 }
 
 let Predicates = [UseSSE2] in {
Index: test/CodeGen/X86/vector-shuffle-256-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -1363,6 +1363,47 @@
   ret <16 x i16> %shuffle
 }
 
+define <16 x i16> @shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,1,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 16, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 24>
+  ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
+; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 7, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 15>
+  ret <16 x i16> %shuffle
+}
+
 ;
 ; Shuffle to logical bit shifts
 ;
Index: test/CodeGen/X86/vector-shuffle-256-v32.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1655,6 +1655,52 @@
   ret <32 x i8> %shuffle
 }
 
+define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,0]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],zero
+; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 48>
+  ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    movl $128, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 47, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 63, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <32 x i8> %shuffle
+}
+
 ;
 ; Shuffle to logical bit shifts
 ;
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -748,6 +748,41 @@
   ret <4 x i64> %shuffle
 }
 
+define <4 x i64> @shuffle_v4i64_z4z6(<4 x i64> %a) {
+; AVX1-LABEL: shuffle_v4i64_z4z6:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_z4z6:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_5zuz(<4 x i64> %a) {
+; AVX1-LABEL: shuffle_v4i64_5zuz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_5zuz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 5, i32 1, i32 undef, i32 3>
+  ret <4 x i64> %shuffle
+}
+
 define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
 ; ALL-LABEL: stress_test1:
 ; ALL: retq
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1822,6 +1822,38 @@
   ret <8 x i32> %shuffle
 }
 
+define <8 x i32> @shuffle_v8i32_zuu8zuuc(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_zuu8zuuc:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
+; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_zuu8zuuc:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19]
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 8, i32 4, i32 undef, i32 undef, i32 12>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_9ubzdefz(<8 x i32> %a) {
+; AVX1-LABEL: shuffle_v8i32_9ubzdefz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v8i32_9ubzdefz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,ymm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero
+; AVX2-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 9, i32 undef, i32 11, i32 3, i32 13, i32 14, i32 15, i32 7>
+  ret <8 x i32> %shuffle
+}
+
 define <8 x float> @splat_mem_v8f32_2(float* %p) {
 ; ALL-LABEL: splat_mem_v8f32_2:
 ; ALL: # BB#0:
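
Usage note (not part of the patch itself): with this change, any 256-bit integer shuffle whose per-128-bit-lane pattern is "zeros shifted in from one end, the remaining elements kept in sequence" is expected to select a single VPSLLDQ/VPSRLDQ under AVX2. Below is a minimal hypothetical reproducer in the style of the tests above; the function name and mask are illustrative and do not appear in the patch. Run through llc -mtriple=x86_64-unknown-unknown -mattr=+avx2, it should lower to a single vpslldq $8 of %a rather than a permute-plus-blend sequence (it is the <8 x i32> analogue of the v4i64 z4z6 case above).

; Each 128-bit lane becomes [zz, zz, a0, a1] / [zz, zz, a4, a5], i.e. an
; in-lane left shift by 8 bytes with zeros shifted in, so the byte shift
; lowering should match it with Shift = 2 dwords (ByteShift = 8).
define <8 x i32> @shl_lane_example(<8 x i32> %a) {
  %shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
  ret <8 x i32> %shuffle
}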