Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -10935,6 +10935,69 @@
                      DAG.getConstant(Rotation, DL, MVT::i8));
 }
 
+/// Try to lower a vector shuffle as a byte shift sequence.
+static SDValue lowerVectorShuffleAsByteShiftMask(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+  assert(VT.is128BitVector() && "Only 128-bit vectors supported");
+
+  // We need a shuffle that has zeros at one/both ends and a sequential
+  // shuffle from one source within.
+  unsigned ZeroLo = Zeroable.countTrailingOnes();
+  unsigned ZeroHi = Zeroable.countLeadingOnes();
+  if (!ZeroLo && !ZeroHi)
+    return SDValue();
+
+  unsigned NumElts = Mask.size();
+  unsigned Len = NumElts - (ZeroLo + ZeroHi);
+  if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
+    return SDValue();
+
+  unsigned Scale = VT.getScalarSizeInBits() / 8;
+  ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
+  if (!isUndefOrInRange(StubMask, 0, NumElts) &&
+      !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
+    return SDValue();
+
+  SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
+  Res = DAG.getBitcast(MVT::v16i8, Res);
+
+  // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
+  // inner sequential set of elements, possibly offset:
+  // 01234567 --> zzzzzz01 --> 1zzzzzzz
+  // 01234567 --> 4567zzzz --> zzzzz456
+  // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
+  if (ZeroLo == 0) {
+    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
+    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+                      DAG.getConstant(Scale * Shift, DL, MVT::i8));
+    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+                      DAG.getConstant(Scale * ZeroHi, DL, MVT::i8));
+  } else if (ZeroHi == 0) {
+    unsigned Shift = Mask[ZeroLo] % NumElts;
+    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+                      DAG.getConstant(Scale * Shift, DL, MVT::i8));
+    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+                      DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
+  } else if (!Subtarget.hasSSSE3()) {
+    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
+    // by performing 3 byte shifts. Shuffle combining can kick in above that.
+    // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
+    unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
+    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+                      DAG.getConstant(Scale * Shift, DL, MVT::i8));
+    Shift += Mask[ZeroLo] % NumElts;
+    Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
+                      DAG.getConstant(Scale * Shift, DL, MVT::i8));
+    Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
+                      DAG.getConstant(Scale * ZeroLo, DL, MVT::i8));
+  } else
+    return SDValue();
+
+  return DAG.getBitcast(VT, Res);
+}
+
 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
 ///
 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
@@ -13339,6 +13402,11 @@
           lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
     return BitBlend;
 
+  // Try to use byte shift instructions to mask.
+  if (SDValue V = lowerVectorShuffleAsByteShiftMask(
+          DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return V;
+
   // Try to lower by permuting the inputs into an unpack instruction.
   if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
                                                       Mask, Subtarget, DAG))
@@ -13588,6 +13656,11 @@
   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
     return V;
 
+  // Try to use byte shift instructions to mask.
+  if (SDValue V = lowerVectorShuffleAsByteShiftMask(
+          DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    return V;
+
   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
   // with PSHUFB. It is important to do this before we attempt to generate any
   // blends but after all of the single-input lowerings. If the single input
Index: llvm/trunk/test/CodeGen/X86/buildvec-extract.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/buildvec-extract.ll
+++ llvm/trunk/test/CodeGen/X86/buildvec-extract.ll
@@ -402,24 +402,16 @@
 }
 
 define <2 x i64> @extract1_i16_zext_insert0_i64_undef(<8 x i16> %x) {
-; SSE2-LABEL: extract1_i16_zext_insert0_i64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: extract1_i16_zext_insert0_i64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: retq
+; SSE-LABEL: extract1_i16_zext_insert0_i64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: extract1_i16_zext_insert0_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT: retq
   %e = extractelement <8 x i16> %x, i32 1
   %z = zext i16 %e to i64
@@ -446,24 +438,16 @@
 }
 
 define <2 x i64> @extract2_i16_zext_insert0_i64_undef(<8 x i16> %x) {
-; SSE2-LABEL: extract2_i16_zext_insert0_i64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: extract2_i16_zext_insert0_i64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: retq
+; SSE-LABEL: extract2_i16_zext_insert0_i64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: extract2_i16_zext_insert0_i64_undef:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT: retq
   %e = extractelement <8 x i16> %x, i32 2
   %z = zext i16 %e to i64
@@ -526,8 +510,9 @@
 define <2 x i64> @extract0_i16_zext_insert1_i64_undef(<8 x i16> %x) {
 ; SSE2-LABEL: extract0_i16_zext_insert1_i64_undef:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: extract0_i16_zext_insert1_i64_undef:
@@ -615,8 +600,9 @@
 define <2 x i64> @extract2_i16_zext_insert1_i64_undef(<8 x i16> %x) {
 ; SSE2-LABEL: extract2_i16_zext_insert1_i64_undef:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: extract2_i16_zext_insert1_i64_undef:
@@ -661,8 +647,9 @@
 define <2 x i64> @extract3_i16_zext_insert1_i64_undef(<8 x i16> %x) {
 ; SSE2-LABEL: extract3_i16_zext_insert1_i64_undef:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: extract3_i16_zext_insert1_i64_undef:
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1535,19 +1535,9 @@
 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
 ; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,0,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,0,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
@@ -1569,70 +1559,65 @@
 }
 
 define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
-; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,4]
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE: # %bb.0:
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: retq
 ;
-; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
+; AVX1-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: retq
 ;
-; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT: retq
 ;
-; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: retq
+; AVX2-FAST-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
   ret <16 x i8> %shuffle
 }
 
 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06(<16 x i8> %a) {
-; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,2,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,6]
-; SSE2-NEXT: packuswb %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
+; SSE: # %bb.0:
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE-NEXT: retq
 ;
-; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,2,3,4,5,6]
-; SSSE3-NEXT: retq
+; AVX1-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX1-NEXT: retq
 ;
-; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,2,3,4,5,6]
-; SSE41-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
+; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX2-SLOW-NEXT: retq
 ;
-; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,2,3,4,5,6]
-; AVX-NEXT: retq
+; AVX2-FAST-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,2,3,4,5,6]
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,2,3,4,5,6]
+; AVX512VL-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
   ret <16 x i8> %shuffle
 }
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -2477,92 +2477,99 @@
 
 ; PR40306
 define <8 x i16> @shuffle_v8i16_9zzzuuuu(<8 x i16> %x) {
-; SSE2-LABEL: shuffle_v8i16_9zzzuuuu:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_9zzzuuuu:
+; SSE: # %bb.0:
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: retq
 ;
-; SSSE3-LABEL: shuffle_v8i16_9zzzuuuu:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_9zzzuuuu:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: retq
 ;
-; SSE41-LABEL: shuffle_v8i16_9zzzuuuu:
-; SSE41: # %bb.0:
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8i16_9zzzuuuu:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT: retq
 ;
-; AVX-LABEL: shuffle_v8i16_9zzzuuuu:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT: retq
+; AVX2-FAST-LABEL: shuffle_v8i16_9zzzuuuu:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_9zzzuuuu:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
+; AVX512VL-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_9zzzuuuu:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-FAST-NEXT: retq
   %r = shufflevector <8 x i16> zeroinitializer, <8 x i16> %x, <8 x i32> <i32 9, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %r
 }
 
 ; PR40318
 define <8 x i16> @shuffle_v8i16_2zzzuuuu(<8 x i16> %x) {
-; SSE2-LABEL: shuffle_v8i16_2zzzuuuu:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_2zzzuuuu:
+; SSE: # %bb.0:
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: retq
 ;
-; SSSE3-LABEL: shuffle_v8i16_2zzzuuuu:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: retq
+; AVX1-LABEL: shuffle_v8i16_2zzzuuuu:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: retq
 ;
-; SSE41-LABEL: shuffle_v8i16_2zzzuuuu:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v8i16_2zzzuuuu:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-SLOW-NEXT: retq
 ;
-; AVX-LABEL: shuffle_v8i16_2zzzuuuu:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT: retq
+; AVX2-FAST-LABEL: shuffle_v8i16_2zzzuuuu:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_2zzzuuuu:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX512VL-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_2zzzuuuu:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-FAST-NEXT: retq
   %r = shufflevector <8 x i16> %x, <8 x i16> zeroinitializer, <8 x i32> <i32 2, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %r
 }
 
 define <8 x i16> @shuffle_v8i16_3uu6zzzz(<8 x i16> %x) {
-; SSE2-LABEL: shuffle_v8i16_3uu6zzzz:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuffle_v8i16_3uu6zzzz:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,u,u,u,u,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuffle_v8i16_3uu6zzzz:
-; SSE41: # %bb.0:
-; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE41-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_3uu6zzzz:
+; SSE: # %bb.0:
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: shuffle_v8i16_3uu6zzzz:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT: retq
 ;
 ; AVX2-SLOW-LABEL: shuffle_v8i16_3uu6zzzz:
 ; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-SLOW-NEXT: retq
 ;
 ; AVX2-FAST-LABEL: shuffle_v8i16_3uu6zzzz:
@@ -2572,8 +2579,8 @@
 ;
 ; AVX512VL-SLOW-LABEL: shuffle_v8i16_3uu6zzzz:
 ; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512VL-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512VL-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX512VL-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512VL-SLOW-NEXT: retq
 ;
 ; AVX512VL-FAST-LABEL: shuffle_v8i16_3uu6zzzz:
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-sse4a.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-sse4a.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-sse4a.ll
@@ -410,8 +410,9 @@
 define <16 x i8> @shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %v) {
 ; AMD10H-LABEL: shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
 ; AMD10H: # %bb.0:
-; AMD10H-NEXT: psrlq $16, %xmm0
-; AMD10H-NEXT: pand {{.*}}(%rip), %xmm0
+; AMD10H-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
+; AMD10H-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AMD10H-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
 ; AMD10H-NEXT: retq
 ;
 ; BTVER1-LABEL: shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
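
Editor's note, not part of the patch: the following is a minimal standalone C++ sketch of the byte-shift masking trick that lowerVectorShuffleAsByteShiftMask relies on. The helpers shl_dq and shr_dq are hypothetical stand-ins that emulate PSLLDQ/PSRLDQ on a 16-byte value; the shift amounts (9, 10, 4) mirror the v16i8 test cases above. Because both instructions shift in zeros across the whole register, two shifts isolate a run of bytes against one end, and a third shift re-places that run at an inner offset, which is exactly how the SSE2 paths above avoid a PAND constant-mask load.

// Illustration only (assumed helper names, not LLVM APIs): emulate the
// whole-register byte shifts and show how they zero the ends of a vector.
#include <array>
#include <cstdio>

using V16 = std::array<unsigned char, 16>;

// Emulates PSLLDQ: byte i moves to byte i + N; vacated bytes become zero.
static V16 shl_dq(const V16 &V, unsigned N) {
  V16 R{}; // zero-initialized, so shifted-out positions stay zero
  for (unsigned I = N; I < 16; ++I)
    R[I] = V[I - N];
  return R;
}

// Emulates PSRLDQ: byte i moves to byte i - N; vacated bytes become zero.
static V16 shr_dq(const V16 &V, unsigned N) {
  V16 R{};
  for (unsigned I = N; I < 16; ++I)
    R[I - N] = V[I];
  return R;
}

int main() {
  V16 X;
  for (unsigned I = 0; I != 16; ++I)
    X[I] = (unsigned char)(0x10 + I); // recognizable nonzero payload

  // shuffle_v16i8_01_02_03_04_05_06_zz..._zz (ZeroLo == 0, ZeroHi == 10):
  // shift bytes 1..6 up against the top, then back down to the bottom;
  // both shifts pull in zeros, so no AND mask is needed.
  V16 A = shr_dq(shl_dq(X, 9), 10);

  // shuffle_v16i8_zz_zz_zz_zz_01..._06_zz..._zz (zeros at both ends):
  // without PSHUFB a third shift re-positions the run at offset ZeroLo = 4.
  V16 B = shl_dq(shr_dq(shl_dq(X, 9), 10), 4);

  for (unsigned I = 0; I != 16; ++I)
    printf("%02x%c", (unsigned)A[I], I == 15 ? '\n' : ' ');
  for (unsigned I = 0; I != 16; ++I)
    printf("%02x%c", (unsigned)B[I], I == 15 ? '\n' : ' ');
  return 0;
}

Running this prints bytes 0x11..0x16 zero-padded at the bottom of A, and the same run at offset 4 in B, matching the pslldq $9 / psrldq $10 (/ pslldq $4) sequences FileCheck verifies above.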