Index: lib/Target/X86/InstPrinter/X86InstComments.cpp =================================================================== --- lib/Target/X86/InstPrinter/X86InstComments.cpp +++ lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -878,6 +878,29 @@ DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::EXTRQI: + if (MI->getOperand(2).isImm() && + MI->getOperand(3).isImm()) + DecodeEXTRQIMask(MI->getOperand(2).getImm(), + MI->getOperand(3).getImm(), + ShuffleMask); + + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + break; + + case X86::INSERTQI: + if (MI->getOperand(3).isImm() && + MI->getOperand(4).isImm()) + DecodeINSERTQIMask(MI->getOperand(3).getImm(), + MI->getOperand(4).getImm(), + ShuffleMask); + + DestName = getRegName(MI->getOperand(0).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + Src2Name = getRegName(MI->getOperand(2).getReg()); + break; + + case X86::PMOVZXBWrr: case X86::PMOVZXBDrr: case X86::PMOVZXBQrr: Index: lib/Target/X86/Utils/X86ShuffleDecode.h =================================================================== --- lib/Target/X86/Utils/X86ShuffleDecode.h +++ lib/Target/X86/Utils/X86ShuffleDecode.h @@ -100,6 +100,14 @@ /// \brief Decode a scalar float move instruction as a shuffle mask. void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode an SSE4A EXTRQ instruction as a v16i8 shuffle mask. +void DecodeEXTRQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode an SSE4A INSERTQ instruction as a v16i8 shuffle mask. 
+void DecodeINSERTQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask); } // llvm namespace #endif Index: lib/Target/X86/Utils/X86ShuffleDecode.cpp =================================================================== --- lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -431,4 +431,78 @@ for (unsigned i = 1; i < NumElts; i++) Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i); } + +void DecodeEXTRQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask) { + // Only the bottom 6 bits are valid for each immediate. + Len &= 0x3F; + Idx &= 0x3F; + + // We can only decode this bit extraction instruction as a shuffle if both the + // length and index work with whole bytes. + if (0 != (Len % 8) || 0 != (Idx % 8)) + return; + + // A length of zero is equivalent to a bit length of 64. + if (Len == 0) + Len = 64; + + // If the length + index exceeds the bottom 64 bits the result is undefined. + if ((Len + Idx) > 64) { + ShuffleMask.append(16, SM_SentinelUndef); + return; + } + + // Convert length and index to work with bytes. + Len /= 8; + Idx /= 8; + + // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes + // of the lower 64-bits. The upper 64-bits are undefined. + for (int i = 0; i != Len; ++i) + ShuffleMask.push_back(i + Idx); + for (int i = Len; i != 8; ++i) + ShuffleMask.push_back(SM_SentinelZero); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(SM_SentinelUndef); +} + +void DecodeINSERTQIMask(int Len, int Idx, + SmallVectorImpl<int> &ShuffleMask) { + // Only the bottom 6 bits are valid for each immediate. + Len &= 0x3F; + Idx &= 0x3F; + + // We can only decode this bit insertion instruction as a shuffle if both the + // length and index work with whole bytes. + if (0 != (Len % 8) || 0 != (Idx % 8)) + return; + + // A length of zero is equivalent to a bit length of 64. + if (Len == 0) + Len = 64; + + // If the length + index exceeds the bottom 64 bits the result is undefined. 
+ if ((Len + Idx) > 64) { + ShuffleMask.append(16, SM_SentinelUndef); + return; + } + + // Convert length and index to work with bytes. + Len /= 8; + Idx /= 8; + + // INSERTQ: Extract lowest Len bytes from lower half of second source and + // insert over first source starting at Idx byte. The upper 64-bits are + // undefined. + for (int i = 0; i != Idx; ++i) + ShuffleMask.push_back(i); + for (int i = 0; i != Len; ++i) + ShuffleMask.push_back(i + 16); + for (int i = Idx + Len; i != 8; ++i) + ShuffleMask.push_back(i); + for (int i = 8; i != 16; ++i) + ShuffleMask.push_back(SM_SentinelUndef); +} + } // llvm namespace Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -210,7 +210,7 @@ FDIV_RND, FMAX_RND, FMIN_RND, - + // Integer add/sub with unsigned saturation. ADDUS, SUBUS, @@ -247,6 +247,9 @@ /// in order to obtain suitable precision. FRSQRT, FRCP, + /// SSE4A Extraction and Insertion. + EXTRQI, INSERTQI, + // Thread Local Storage. TLSADDR, Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -3882,6 +3882,15 @@ return Subtarget->hasLZCNT(); } +/// isUndefInRange - Return true if every element in Mask, beginning +/// from position Pos and ending in Pos+Size is undef. +static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { + for (unsigned i = Pos, e = Pos + Size; i != e; ++i) + if (0 <= Mask[i]) + return false; + return true; +} + /// isUndefOrInRange - Return true if Val is undef or if its value falls within /// the specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { @@ -6858,6 +6867,136 @@ return SDValue(); } +/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. 
+static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + assert(!Zeroable.all() && "Fully zeroable shuffle mask"); + + int Size = Mask.size(); + int HalfSize = Size / 2; + assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); + + // Upper half must be undefined. + if (!isUndefInRange(Mask, HalfSize, HalfSize)) + return SDValue(); + + // EXTRQ: Extract Len elements from lower half of source, starting at Idx. + // Remainder of lower half result is zero and upper half is all undef. + auto LowerAsEXTRQ = [&]() { + // Determine the extraction length from the part of the + // lower half that isn't zeroable (stop at 0; never index Zeroable[-1]). + int Len = HalfSize; + for (; Len > 0; --Len) + if (!Zeroable[Len - 1]) + break; + assert(Len > 0 && "Zeroable shuffle mask"); + + // Attempt to match first Len sequential elements from the lower half. + SDValue Src; + int Idx = -1; + for (int i = 0; i != Len; ++i) { + int M = Mask[i]; + if (M < 0) + continue; + SDValue &V = (M < Size ? V1 : V2); + M = M % Size; + + // Elements must lie in the lower half, with M >= i so Idx stays >= 0. + if (i > M || M >= HalfSize) + return SDValue(); + + if (Idx < 0 || (Src == V && Idx == (M - i))) { + Src = V; + Idx = M - i; + continue; + } + return SDValue(); + } + + if (Idx < 0) + return SDValue(); + + assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); + int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; + int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; + return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); + }; + + if (SDValue ExtrQ = LowerAsEXTRQ()) + return ExtrQ; + + // INSERTQ: Extract lowest Len elements from lower half of second source and + // insert over first source, starting at Idx. + // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... 
} + auto LowerAsInsertQ = [&]() { + for (int Idx = 0; Idx != HalfSize; ++Idx) { + SDValue Base; + + // Attempt to match first source from mask before insertion point. + if (isUndefInRange(Mask, 0, Idx)) { + /* EMPTY */ + } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { + Base = V1; + } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { + Base = V2; + } else { + continue; + } + + // Extend the extraction length looking to match both the insertion of + // the second source and the remaining elements of the first. + for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { + SDValue Insert; + int Len = Hi - Idx; + + // Match insertion. + if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { + Insert = V1; + } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { + Insert = V2; + } else { + continue; + } + + // Match the remaining elements of the lower half. + if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { + /* EMPTY */ + } else if ((!Base || (Base == V1)) && + isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { + Base = V1; + } else if ((!Base || (Base == V2)) && + isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, + Size + Hi)) { + Base = V2; + } else { + continue; + } + + // We may not have a base (first source) - this can safely be undefined. + if (!Base) + Base = DAG.getUNDEF(VT); + + int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; + int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; + return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); + } + } + + return SDValue(); + }; + + if (SDValue InsertQ = LowerAsInsertQ()) + return InsertQ; + + return SDValue(); +} + /// \brief Lower a vector shuffle as a zero or any extension. /// /// Given a specific number of elements, element bit width, and extension @@ -6865,7 +7004,7 @@ /// features of the subtarget. 
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV, - const X86Subtarget *Subtarget, SelectionDAG &DAG) { + ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); int NumElements = VT.getVectorNumElements(); int EltBits = VT.getScalarSizeInBits(); @@ -6902,6 +7041,28 @@ getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG))); } + // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes + // to 64-bits. + if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) { + assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); + assert(VT.getSizeInBits() == 128 && "Unexpected vector width!"); + + SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, + DAG.getConstant(EltBits, DL, MVT::i8), + DAG.getConstant(0, DL, MVT::i8))); + if (isUndefInRange(Mask, NumElements/2, NumElements/2)) + return DAG.getNode(ISD::BITCAST, DL, VT, Lo); + + SDValue Hi = + DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, + DAG.getConstant(EltBits, DL, MVT::i8), + DAG.getConstant(EltBits, DL, MVT::i8))); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); + } + // If this would require more than 2 unpack instructions to expand, use // pshufb when available. We can only use more than 2 unpack instructions // when zero extending i8 elements which also makes it easier to use pshufb. @@ -6992,7 +7153,7 @@ return SDValue(); return lowerVectorShuffleAsSpecificZeroOrAnyExtend( - DL, VT, Scale, AnyExt, InputV, Subtarget, DAG); + DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. @@ -8519,6 +8680,11 @@ lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, DAG)) return Shift; + // See if we can use SSE4A Extraction / Insertion. 
+ if (Subtarget->hasSSE4A()) + if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG)) + return V; + // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2, @@ -8671,6 +8837,11 @@ DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return ZExt; + // See if we can use SSE4A Extraction / Insertion. + if (Subtarget->hasSSE4A()) + if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return V; + int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); @@ -14993,6 +15164,9 @@ case INTR_TYPE_3OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case INTR_TYPE_4OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); case INTR_TYPE_1OP_MASK_RM: { SDValue Src = Op.getOperand(1); SDValue Src0 = Op.getOperand(2); @@ -18132,6 +18306,8 @@ case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; + case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; + case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -207,6 +207,14 @@ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>]>>; +def X86extrqi : SDNode<"X86ISD::EXTRQI", + SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, + SDTCisVT<2, i8>, SDTCisVT<3, i8>]>>; +def X86insertqi : SDNode<"X86ISD::INSERTQI", + SDTypeProfile<1, 4, [SDTCisVT<0, v2i64>, 
SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisVT<3, i8>, + SDTCisVT<4, i8>]>>; + // Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get // translated into one of the target nodes below during lowering. // Note: this is a work in progress... Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -7770,7 +7770,7 @@ def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), (ins VR128:$src, u8imm:$len, u8imm:$idx), "extrq\t{$idx, $len, $src|$src, $len, $idx}", - [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len, + [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len, imm:$idx))]>, PD; def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), @@ -7781,8 +7781,8 @@ def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", - [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src, - VR128:$src2, imm:$len, imm:$idx))]>, XD; + [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, + imm:$len, imm:$idx))]>, XD; def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), "insertq\t{$mask, $src|$src, $mask}", Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -19,7 +19,7 @@ enum IntrinsicType { INTR_NO_TYPE, GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, - INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, + INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK, INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, @@ -757,6 +757,8 @@ X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), 
X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0), + X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0), X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE), X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT), Index: test/CodeGen/X86/vector-shuffle-sse4a.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-sse4a.ll +++ test/CodeGen/X86/vector-shuffle-sse4a.ll @@ -0,0 +1,221 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver1 | FileCheck %s --check-prefix=ALL --check-prefix=BTVER1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=BTVER2 + +; +; EXTRQI +; + +define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) { +; BTVER1-LABEL: shuf_0zzzuuuuuuuuuuuu: +; BTVER1: # BB#0: +; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: retq +; +; BTVER2-LABEL: shuf_0zzzuuuuuuuuuuuu: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; BTVER2-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %s +} + +define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) { +; BTVER1-LABEL: shuf_0zzzzzzz1zzzzzzz: +; BTVER1: # BB#0: +; BTVER1-NEXT: movaps %xmm0, %xmm1 +; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; BTVER1-NEXT: retq +; +; BTVER2-LABEL: shuf_0zzzzzzz1zzzzzzz: +; 
BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; BTVER2-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %s +} + +define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) { +; BTVER1-LABEL: shuf_01zzuuuuuuuuuuuu: +; BTVER1: # BB#0: +; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: retq +; +; BTVER2-LABEL: shuf_01zzuuuuuuuuuuuu: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; BTVER2-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %s +} + +define <16 x i8> @shuf_01zzzzzz23zzzzzz(<16 x i8> %a0) { +; BTVER1-LABEL: shuf_01zzzzzz23zzzzzz: +; BTVER1: # BB#0: +; BTVER1-NEXT: movaps %xmm0, %xmm1 +; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; BTVER1-NEXT: retq +; +; BTVER2-LABEL: shuf_01zzzzzz23zzzzzz: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; BTVER2-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %s +} + +define <16 x i8> @shuf_1zzzuuuuuuuuuuuu(<16 x i8> %a0) { +; ALL-LABEL: shuf_1zzzuuuuuuuuuuuu: +; ALL: # BB#0: +; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> + ret <16 x i8> %s +} + +define <8 x i16> @shuf_1zzzuuuu(<8 x i16> %a0) { +; ALL-LABEL: shuf_1zzzuuuu: +; ALL: # BB#0: +; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; 
ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_12zzuuuu(<8 x i16> %a0) { +; ALL-LABEL: shuf_12zzuuuu: +; ALL: # BB#0: +; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3,4,5],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_012zuuuu(<8 x i16> %a0) { +; ALL-LABEL: shuf_012zuuuu: +; ALL: # BB#0: +; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) { +; BTVER1-LABEL: shuf_0zzz1zzz: +; BTVER1: # BB#0: +; BTVER1-NEXT: movaps %xmm0, %xmm1 +; BTVER1-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] +; BTVER1-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; BTVER1-NEXT: retq +; +; BTVER2-LABEL: shuf_0zzz1zzz: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; BTVER2-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> + ret <8 x i16> %s +} + +define <4 x i32> @shuf_0z1z(<4 x i32> %a0) { +; BTVER1-LABEL: shuf_0z1z: +; BTVER1: # BB#0: +; BTVER1-NEXT: pxor %xmm1, %xmm1 +; BTVER1-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; BTVER1-NEXT: retq +; +; BTVER2-LABEL: shuf_0z1z: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; BTVER2-NEXT: retq + %s = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %s +} + +; +; INSERTQI +; + +define <16 x i8> @shuf_0_0_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) { +; ALL-LABEL: shuf_0_0_2_3_uuuu_uuuu_uuuu: +; ALL: # 
BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> + ret <16 x i8> %s +} + +define <16 x i8> @shuf_0_16_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) { +; ALL-LABEL: shuf_0_16_2_3_uuuu_uuuu_uuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3,4,5,6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> + ret <16 x i8> %s +} + +define <16 x i8> @shuf_16_1_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) { +; ALL-LABEL: shuf_16_1_2_3_uuuu_uuuu_uuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> + ret <16 x i8> %s +} + +define <8 x i16> @shuf_0823uuuu(<8 x i16> %a0, <8 x i16> %a1) { +; ALL-LABEL: shuf_0823uuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1],xmm0[4,5,6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_0183uuuu(<8 x i16> %a0, <8 x i16> %a1) { +; ALL-LABEL: shuf_0183uuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[0,1],xmm0[6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_0128uuuu(<8 x i16> %a0, <8 x i16> %a1) { +; ALL-LABEL: shuf_0128uuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[0,1],xmm0[u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_0893uuuu(<8 x i16> %a0, <8 x i16> %a1) { +; ALL-LABEL: shuf_0893uuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x 
i16> %a1, <8 x i32> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_089Auuuu(<8 x i16> %a0, <8 x i16> %a1) { +; ALL-LABEL: shuf_089Auuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3,4,5],xmm0[u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> + ret <8 x i16> %s +} + +define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) { +; ALL-LABEL: shuf_089uuuuu: +; ALL: # BB#0: +; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u] +; ALL-NEXT: retq + %s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> + ret <8 x i16> %s +}