Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -96,7 +96,7 @@
 /// BUILD_VECTOR where all of the elements are ~0 or undef.
 bool ISD::isBuildVectorAllOnes(const SDNode *N) {
   // Look through a bit convert.
-  if (N->getOpcode() == ISD::BITCAST)
+  while (N->getOpcode() == ISD::BITCAST)
     N = N->getOperand(0).getNode();
 
   if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
@@ -144,7 +144,7 @@
 /// BUILD_VECTOR where all of the elements are 0 or undef.
 bool ISD::isBuildVectorAllZeros(const SDNode *N) {
   // Look through a bit convert.
-  if (N->getOpcode() == ISD::BITCAST)
+  while (N->getOpcode() == ISD::BITCAST)
     N = N->getOperand(0).getNode();
 
   if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -7453,12 +7453,13 @@
 /// \brief Try to lower a vector shuffle as a byte rotation.
 ///
-/// We have a generic PALIGNR instruction in x86 that will do an arbitrary
-/// byte-rotation of the concatenation of two vectors. This routine will
-/// try to generically lower a vector shuffle through such an instruction. It
-/// does not check for the availability of PALIGNR-based lowerings, only the
-/// applicability of this strategy to the given mask. This matches shuffle
-/// vectors that look like:
+/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
+/// byte-rotation of the concatenation of two vectors; pre-SSSE3 targets can
+/// use a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
+/// try to generically lower a vector shuffle through such a pattern. It
+/// does not check for the profitability of lowering either as PALIGNR or
+/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
+/// This matches shuffle vectors that look like:
 ///
 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
 ///
@@ -7471,6 +7472,7 @@
 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
                                               SDValue V2,
                                               ArrayRef<int> Mask,
+                                              const X86Subtarget *Subtarget,
                                               SelectionDAG &DAG) {
   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
@@ -7531,21 +7533,40 @@
   else if (!Hi)
     Hi = Lo;
 
-  // Cast the inputs to v16i8 to match PALIGNR.
-  Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
-  Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
-
   assert(VT.getSizeInBits() == 128 &&
          "Rotate-based lowering only supports 128-bit lowering!");
   assert(Mask.size() <= 16 &&
          "Can shuffle at most 16 bytes in a 128-bit vector!");
+
   // The actual rotate instruction rotates bytes, so we need to scale the
   // rotation based on how many bytes are in the vector.
   int Scale = 16 / Mask.size();
 
+  // SSSE3 targets can use the palignr instruction.
+  if (Subtarget->hasSSSE3()) {
+    // Cast the inputs to v16i8 to match PALIGNR.
+    Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
+    Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
+
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
+                                   DAG.getConstant(Rotation * Scale, MVT::i8)));
+  }
+
+  // Default SSE2 implementation.
+  int LoByteShift = 16 - Rotation * Scale;
+  int HiByteShift = Rotation * Scale;
+
+  // Cast the inputs to v2i64 to match PSLLDQ.
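+  // (PSRLDQ uses the same v2i64 type.) The rotation is rebuilt from two
+  // zero-filling byte shifts and an OR: Hi is shifted right by the rotation
+  // amount (Rotation * Scale bytes), Lo is shifted left by the remaining
+  // 16 - Rotation * Scale bytes, and the two shifted values are OR'd back
+  // together. For example, the v8i16 shuffle <3, 4, u, 6, 7, 8, 9, 10>
+  // becomes psrldq $6, pslldq $10 and por (see the updated test6 in
+  // test/CodeGen/X86/palignr.ll below).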
+  Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
+  Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
+
+  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
+                                DAG.getConstant(8 * LoByteShift, MVT::i8));
+  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
+                                DAG.getConstant(8 * HiByteShift, MVT::i8));
   return DAG.getNode(ISD::BITCAST, DL, VT,
-                     DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
-                                 DAG.getConstant(Rotation * Scale, MVT::i8)));
+                     DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
 }
 
 /// \brief Compute whether each element of a shuffle is zeroable.
@@ -7587,6 +7608,88 @@
   return Zeroable;
 }
 
+/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
+///
+/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
+/// byte-shift instructions. The mask must consist of a shifted sequential
+/// shuffle from one of the input vectors and zeroable elements for the
+/// remaining 'shifted in' elements.
+///
+/// Note that this only handles 128-bit vector widths currently.
+static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
+                                             SDValue V2, ArrayRef<int> Mask,
+                                             SelectionDAG &DAG) {
+  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+  int Size = Mask.size();
+  int Scale = 16 / Size;
+
+  auto isSequential = [](int Base, int StartIndex, int EndIndex, int MaskOffset,
+                         ArrayRef<int> Mask) {
+    for (int i = StartIndex; i < EndIndex; i++) {
+      if (Mask[i] < 0)
+        continue;
+      if (i + Base != Mask[i] - MaskOffset)
+        return false;
+    }
+    return true;
+  };
+
+  for (int Shift = 1; Shift < Size; Shift++) {
+    int ByteShift = Shift * Scale;
+
+    // PSRLDQ : (little-endian) right byte shift
+    // [ 5, 6, 7, zz, zz, zz, zz, zz]
+    // [ -1, 5, 6, 7, zz, zz, zz, zz]
+    // [ 1, 2, -1, -1, -1, -1, zz, zz]
+    bool ZeroableRight = true;
+    for (int i = Size - Shift; i < Size; i++) {
+      ZeroableRight &= Zeroable[i];
+    }
+
+    if (ZeroableRight) {
+      bool ValidShiftRight1 = isSequential(Shift, 0, Size - Shift, 0, Mask);
+      bool ValidShiftRight2 = isSequential(Shift, 0, Size - Shift, Size, Mask);
+
+      if (ValidShiftRight1 || ValidShiftRight2) {
+        // Cast the inputs to v2i64 to match PSRLDQ.
+        SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
+        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
+        SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
+                                      DAG.getConstant(ByteShift * 8, MVT::i8));
+        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
+      }
+    }
+
+    // PSLLDQ : (little-endian) left byte shift
+    // [ zz, 0, 1, 2, 3, 4, 5, 6]
+    // [ zz, zz, -1, -1, 2, 3, 4, -1]
+    // [ zz, zz, zz, zz, zz, zz, -1, 1]
+    bool ZeroableLeft = true;
+    for (int i = 0; i < Shift; i++) {
+      ZeroableLeft &= Zeroable[i];
+    }
+
+    if (ZeroableLeft) {
+      bool ValidShiftLeft1 = isSequential(-Shift, Shift, Size, 0, Mask);
+      bool ValidShiftLeft2 = isSequential(-Shift, Shift, Size, Size, Mask);
+
+      if (ValidShiftLeft1 || ValidShiftLeft2) {
+        // Cast the inputs to v2i64 to match PSLLDQ.
+        SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
+        SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
+        SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
+                                      DAG.getConstant(ByteShift * 8, MVT::i8));
+        return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 /// \brief Lower a vector shuffle as a zero or any extension.
 ///
 /// Given a specific number of elements, element bit width, and extension
@@ -8090,10 +8193,16 @@
                                                  Subtarget, DAG))
     return Blend;
 
-  // Try to use rotation instructions if available.
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v2i64, V1, V2, Mask, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
   if (Subtarget->hasSSSE3())
     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
-            DL, MVT::v2i64, V1, V2, Mask, DAG))
+            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
       return Rotate;
 
   // We implement this with SHUFPD which is pretty lame because it will likely
@@ -8366,10 +8475,16 @@
                                                  Subtarget, DAG))
     return Blend;
 
-  // Try to use rotation instructions if available.
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v4i32, V1, V2, Mask, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
   if (Subtarget->hasSSSE3())
     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
-            DL, MVT::v4i32, V1, V2, Mask, DAG))
+            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;
 
   // We implement this with SHUFPS because it can blend from two vectors.
@@ -8434,11 +8549,15 @@
   if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
 
-  // Try to use rotation instructions if available.
-  if (Subtarget->hasSSSE3())
-    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
-            DL, MVT::v8i16, V, V, Mask, DAG))
-      return Rotate;
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i16, V, V, Mask, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
+    return Rotate;
 
   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
   // such inputs we can swap two of the dwords across the half mark and end up
@@ -9058,11 +9177,15 @@
                                                  Subtarget, DAG))
     return Blend;
 
-  // Try to use rotation instructions if available.
-  if (Subtarget->hasSSSE3())
-    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
-            DL, MVT::v8i16, V1, V2, Mask, DAG))
-      return Rotate;
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
+    return Rotate;
 
   if (NumV1Inputs + NumV2Inputs <= 4)
     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
@@ -9193,11 +9316,15 @@
   ArrayRef<int> OrigMask = SVOp->getMask();
   assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
 
-  // Try to use rotation instructions if available.
-  if (Subtarget->hasSSSE3())
-    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
-            DL, MVT::v16i8, V1, V2, OrigMask, DAG))
-      return Rotate;
+  // Try to use byte shift instructions.
+  if (SDValue Shift = lowerVectorShuffleAsByteShift(
+          DL, MVT::v16i8, V1, V2, OrigMask, DAG))
+    return Shift;
+
+  // Try to use byte rotation instructions.
+  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
+          DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
+    return Rotate;
 
   // Try to use a zext lowering.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( Index: llvm/trunk/test/CodeGen/X86/palignr.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/palignr.ll +++ llvm/trunk/test/CodeGen/X86/palignr.ll @@ -86,16 +86,9 @@ ; ; CHECK-YONAH-LABEL: test6: ; CHECK-YONAH: # BB#0: -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; CHECK-YONAH-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] -; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,2,1,4,5,6,7] -; CHECK-YONAH-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; CHECK-YONAH-NEXT: por %xmm1, %xmm0 ; CHECK-YONAH-NEXT: retl %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 3, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10 > ret <8 x i16> %C @@ -110,15 +103,9 @@ ; ; CHECK-YONAH-LABEL: test7: ; CHECK-YONAH: # BB#0: -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; CHECK-YONAH-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; CHECK-YONAH-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; CHECK-YONAH-NEXT: por %xmm1, %xmm0 ; CHECK-YONAH-NEXT: retl %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 undef, i32 6, i32 undef, i32 8, i32 9, i32 10, i32 11, i32 12 > ret <8 x i16> %C @@ -133,33 +120,9 @@ ; ; CHECK-YONAH-LABEL: test8: ; CHECK-YONAH: # BB#0: -; CHECK-YONAH-NEXT: pxor %xmm3, %xmm3 -; CHECK-YONAH-NEXT: movdqa %xmm0, %xmm2 -; CHECK-YONAH-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] -; CHECK-YONAH-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,1,2,0] -; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3] -; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] -; CHECK-YONAH-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; 
CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7] -; CHECK-YONAH-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] -; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7] -; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] -; CHECK-YONAH-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; CHECK-YONAH-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-YONAH-NEXT: packuswb %xmm0, %xmm2 -; CHECK-YONAH-NEXT: movdqa %xmm2, %xmm0 +; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero +; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; CHECK-YONAH-NEXT: por %xmm1, %xmm0 ; CHECK-YONAH-NEXT: retl %C = shufflevector <16 x i8> %A, <16 x i8> %B, <16 x i32> < i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20 > ret <16 x i8> %C @@ -178,11 +141,11 @@ ; ; CHECK-YONAH-LABEL: test9: ; CHECK-YONAH: # BB#0: -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3] -; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; CHECK-YONAH-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,2,4,5,6,7] -; CHECK-YONAH-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; CHECK-YONAH-NEXT: movdqa %xmm1, %xmm0 +; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; CHECK-YONAH-NEXT: por %xmm0, %xmm1 +; CHECK-YONAH-NEXT: movdqa %xmm1, %xmm0 ; CHECK-YONAH-NEXT: retl %C = shufflevector <8 x i16> %B, <8 x i16> %A, <8 x i32> < i32 undef, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0 > ret <8 x i16> %C Index: llvm/trunk/test/CodeGen/X86/sse3.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse3.ll +++ llvm/trunk/test/CodeGen/X86/sse3.ll @@ -8,18 +8,18 @@ define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind { ; X64-LABEL: t0: ; X64: ## BB#0: ## %entry -; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: movl $1, %eax +; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq entry: %tmp3 = load <8 x i16>* %old %tmp6 = shufflevector <8 x i16> %tmp3, - <8 x i16> < i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >, + <8 x i16> < i16 1, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >, <8 x i32> < i32 8, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef > store <8 x i16> %tmp6, <8 x i16>* %dest ret void - } define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind { Index: llvm/trunk/test/CodeGen/X86/vec_insert-5.ll 
=================================================================== --- llvm/trunk/test/CodeGen/X86/vec_insert-5.ll +++ llvm/trunk/test/CodeGen/X86/vec_insert-5.ll @@ -63,10 +63,7 @@ define <16 x i8> @t5(<16 x i8> %x) nounwind { ; CHECK-LABEL: t5: ; CHECK: # BB#0: -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,1] -; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero -; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero ; CHECK-NEXT: retl %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> ret <16 x i8> %s @@ -75,7 +72,7 @@ define <16 x i8> @t6(<16 x i8> %x) nounwind { ; CHECK-LABEL: t6: ; CHECK: # BB#0: -; CHECK-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero ; CHECK-NEXT: retl %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> ret <16 x i8> %s @@ -84,7 +81,7 @@ define <16 x i8> @t7(<16 x i8> %x) nounwind { ; CHECK-LABEL: t7: ; CHECK: # BB#0: -; CHECK-NEXT: palignr {{.*#+}} xmm0 = xmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2] +; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] ; CHECK-NEXT: retl %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> ret <16 x i8> %s Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -575,36 +575,26 @@ define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) { ; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; SSE2: # BB#0: -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movd %edi, %xmm0 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; SSSE3: # BB#0: ; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,u,u,3,u,u,6,7,8,9,10,11,12,13,14],zero -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[u,u],zero,xmm0[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; SSE41: # BB#0: ; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,u,u,3,u,u,6,7,8,9,10,11,12,13,14],zero -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[u,u],zero,xmm0[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: ; AVX: # BB#0: ; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,u,u,3,u,u,6,7,8,9,10,11,12,13,14],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
zero,xmm0[u,u],zero,xmm0[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] -; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] ; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> @@ -622,7 +612,7 @@ ; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSSE3: # BB#0: ; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12] +; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] ; SSSE3-NEXT: pxor %xmm1, %xmm1 ; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15] ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -632,7 +622,7 @@ ; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE41: # BB#0: ; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12] +; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15] ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -642,7 +632,7 @@ ; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; AVX: # BB#0: ; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12] +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -653,40 +643,60 @@ ret <16 x i8> %shuffle } +define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu: +; SSE2: # BB#0: +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu: +; SSSE3: # BB#0: +; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu: +; SSE41: # BB#0: +; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu: +; AVX: # BB#0: +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: retq +; 
+; SSSE3-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> + ret <16 x i8> %shuffle +} + define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: ; SSE2: # BB#0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[3,1,2,0] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,0,1,2,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,3,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,6] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: @@ -710,34 +720,10 @@ define <16 x i8> 
@shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: ; SSE2: # BB#0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,1,2,0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,0] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,3,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: @@ -761,37 +747,9 @@ define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: ; SSE2: # BB#0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,1,2,0] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,2,3,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] 
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: packuswb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: @@ -815,37 +773,9 @@ define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: ; SSE2: # BB#0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[3,1,2,0] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,2,3,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,7] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: packuswb %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = 
xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: @@ -871,34 +801,10 @@ define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: ; SSE2: # BB#0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,1,2,0] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,0] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,3,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: @@ -922,37 +828,9 @@ define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: ; SSE2: # BB#0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[3,1,2,0] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; 
SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,0,1,2,4,5,6,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,3,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,1,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,6] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: packuswb %xmm3, %xmm0 +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1467,18 +1467,9 @@ define <8 x i16> @shuffle_v8i16_def01234(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_def01234: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,3,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_def01234: @@ -1502,12 +1493,9 @@ define <8 x i16> @shuffle_v8i16_ueuu123u(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_ueuu123u: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: pshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,5,7] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_ueuu123u: @@ -1531,11 +1519,10 @@ define <8 x i16> @shuffle_v8i16_56701234(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_56701234: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,6] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_56701234: @@ -1559,9 +1546,10 @@ define <8 x i16> @shuffle_v8i16_u6uu123u(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_u6uu123u: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,7] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_u6uu123u: @@ -1585,23 +1573,22 @@ define <8 x i16> @shuffle_v8i16_uuuu123u(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_uuuu123u: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,7] +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_uuuu123u: ; SSSE3: # BB#0: -; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v8i16_uuuu123u: ; SSE41: # BB#0: -; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v8i16_uuuu123u: ; AVX: # BB#0: -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] ; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -1610,17 +1597,9 @@ define <8 x i16> @shuffle_v8i16_bcdef012(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_bcdef012: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; 
SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,0,1,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_bcdef012: @@ -1644,12 +1623,9 @@ define <8 x i16> @shuffle_v8i16_ucdeuu1u(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_ucdeuu1u: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_ucdeuu1u: @@ -1673,11 +1649,10 @@ define <8 x i16> @shuffle_v8i16_34567012(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_34567012: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_34567012: @@ -1701,9 +1676,10 @@ define <8 x i16> @shuffle_v8i16_u456uu1u(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_u456uu1u: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_u456uu1u: @@ -1727,23 +1703,22 @@ define <8 x i16> @shuffle_v8i16_u456uuuu(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_u456uuuu: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,2,4,5,6,7] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_u456uuuu: ; SSSE3: # BB#0: -; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: retq ; 
; SSE41-LABEL: shuffle_v8i16_u456uuuu: ; SSE41: # BB#0: -; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_v8i16_u456uuuu: ; AVX: # BB#0: -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -1752,17 +1727,9 @@ define <8 x i16> @shuffle_v8i16_3456789a(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_3456789a: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_3456789a: @@ -1788,12 +1755,9 @@ define <8 x i16> @shuffle_v8i16_u456uu9u(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_u456uu9u: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_u456uu9u: @@ -1819,17 +1783,9 @@ define <8 x i16> @shuffle_v8i16_56789abc(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_56789abc: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; 
SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_56789abc: @@ -1855,12 +1811,9 @@ define <8 x i16> @shuffle_v8i16_u6uu9abu(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: shuffle_v8i16_u6uu9abu: ; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,5,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,5,7] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v8i16_u6uu9abu: