Index: include/llvm/Target/TargetSelectionDAG.td =================================================================== --- include/llvm/Target/TargetSelectionDAG.td +++ include/llvm/Target/TargetSelectionDAG.td @@ -160,6 +160,10 @@ SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisVT<2, OtherVT>, SDTCisVTSmallerThanOp<2, 1> ]>; +def SDTExtInvec : SDTypeProfile<1, 1, [ // sext_invec + SDTCisInt<0>, SDTCisVec<0>, SDTCisInt<1>, SDTCisVec<1>, + SDTCisSameSizeAs<0,1> +]>; def SDTSetCC : SDTypeProfile<1, 3, [ // setcc SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT> @@ -406,6 +410,9 @@ [SDNPCommutative, SDNPAssociative]>; def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>; +def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>; +def zext_invec : SDNode<"ISD::ZERO_EXTEND_VECTOR_INREG", SDTExtInvec>; + def bitreverse : SDNode<"ISD::BITREVERSE" , SDTIntUnaryOp>; def bswap : SDNode<"ISD::BSWAP" , SDTIntUnaryOp>; def ctlz : SDNode<"ISD::CTLZ" , SDTIntUnaryOp>; Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2419,6 +2419,20 @@ } break; } + case ISD::ZERO_EXTEND_VECTOR_INREG: { + EVT InVT = Op.getOperand(0).getValueType(); + unsigned InBits = InVT.getScalarSizeInBits(); + APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits); + KnownZero = KnownZero.trunc(InBits); + KnownOne = KnownOne.trunc(InBits); + computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, + DemandedElts.zext(InVT.getVectorNumElements()), + Depth + 1); + KnownZero = KnownZero.zext(BitWidth); + KnownOne = KnownOne.zext(BitWidth); + KnownZero |= NewBits; + break; + } case ISD::ZERO_EXTEND: { EVT InVT = Op.getOperand(0).getValueType(); unsigned InBits = InVT.getScalarSizeInBits(); @@ -2859,6 +2873,7 @@ } case ISD::SIGN_EXTEND: + case ISD::SIGN_EXTEND_VECTOR_INREG: Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits(); return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp; Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -923,6 +923,14 @@ // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Legal); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Legal); + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Legal); + + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v2i64, Legal); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v4i32, Legal); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v8i16, Legal); + for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); @@ -5133,6 +5141,26 @@ return DAG.getBitcast(VT, Vec); } +static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In, + SelectionDAG &DAG) { + EVT InVT = In.getValueType(); + assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode"); + + if (VT.is128BitVector() && InVT.is128BitVector()) + return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT) + : DAG.getZeroExtendVectorInReg(In, DL, VT); + + // For 256-bit vectors, we only need the lower (128-bit) input half. 
+ // For 512-bit vectors, we only need the lower input half or quarter. + if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) { + int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); + In = extractSubVector(In, 0, DAG, DL, + std::max(128, (int)VT.getSizeInBits() / Scale)); + } + + return DAG.getNode(Opc, DL, VT, In); +} + /// Generate unpacklo/unpackhi shuffle mask. static void createUnpackShuffleMask(MVT VT, SmallVectorImpl &Mask, bool Lo, bool Unary) { @@ -5849,6 +5877,7 @@ } return true; } + case ISD::ZERO_EXTEND_VECTOR_INREG: case X86ISD::VZEXT: { // TODO - add support for VPMOVZX with smaller input vector types. SDValue Src = N.getOperand(0); @@ -9211,14 +9240,7 @@ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); - - // For 256-bit vectors, we only need the lower (128-bit) input half. - // For 512-bit vectors, we only need the lower input half or quarter. - if (VT.getSizeInBits() > 128) - InputV = extractSubVector(InputV, 0, DAG, DL, - std::max(128, (int)VT.getSizeInBits() / Scale)); - - InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV); + InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } @@ -15643,7 +15665,7 @@ // word to byte only under BWI if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8 return DAG.getNode(X86ISD::VTRUNC, DL, VT, - DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In)); + getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG)); return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); } @@ -17622,8 +17644,8 @@ if (VT.is512BitVector() && InVTElt != MVT::i1 && (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) { if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) - return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); - return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG); + return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG); } if (InVTElt != MVT::i1) @@ -17635,7 +17657,7 @@ SDValue V; if (Subtarget.hasDQI()) { - V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In); + V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG); assert(!VT.is512BitVector() && "Unexpected vector type"); } else { SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl); @@ -17690,8 +17712,10 @@ // SSE41 targets can use the pmovsx* instructions directly. unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? X86ISD::VSEXT : X86ISD::VZEXT; - if (Subtarget.hasSSE41()) + if (Subtarget.hasSSE41()) { + assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"); return DAG.getNode(ExtOpc, dl, VT, In); + } // We should only get here for sign extend. assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG && @@ -17776,8 +17800,8 @@ MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); - OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); - OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi); + OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT); + OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } @@ -18092,7 +18116,7 @@ if (Ext == ISD::SEXTLOAD) { // If we have SSE4.1, we can directly emit a VSEXT node. 
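The ISD::SIGN_EXTEND_VECTOR_INREG / ISD::ZERO_EXTEND_VECTOR_INREG nodes used by getExtendInVec take an input vector of the same total width as the result (that is what SDTCisSameSizeAs<0,1> in the new SDTExtInvec profile enforces), extend only the low input elements that fit the wider result lanes, and ignore the rest. That is what justifies the new SelectionDAG cases earlier in the patch: a zero-extended lane has its top BitWidth - InBits bits known zero, and a sign-extended lane gains VTBits - InBits extra sign bits. Below is a minimal scalar model of the zero-extend case for the v16i8 -> v8i16 form that PMOVZXBW implements; it is written only for illustration, is not LLVM code, and the helper name is invented for the example.

// Standalone model (illustration only, not part of the patch) of
// ISD::ZERO_EXTEND_VECTOR_INREG on v16i8 -> v8i16, i.e. what PMOVZXBW
// does with the low 64 bits of its input.
#include <array>
#include <cstdint>
#include <cstdio>

// Zero-extend the low 8 bytes of In into 16-bit lanes; the high 8 bytes of
// In are simply dropped, mirroring the "in-reg" semantics.
static std::array<uint16_t, 8> zextVectorInReg(const std::array<uint8_t, 16> &In) {
  std::array<uint16_t, 8> Out{};
  for (size_t I = 0; I != Out.size(); ++I)
    Out[I] = static_cast<uint16_t>(In[I]); // top 8 bits of each lane become 0
  return Out;
}

int main() {
  std::array<uint8_t, 16> In{};
  for (size_t I = 0; I != In.size(); ++I)
    In[I] = static_cast<uint8_t>(0xF0 + I);

  for (uint16_t V : zextVectorInReg(In)) {
    // Every result lane has its top 8 bits known to be zero, which is the
    // fact the computeKnownBits change records via APInt::getHighBitsSet.
    std::printf("0x%04x (high byte = 0x%02x)\n", unsigned(V), unsigned(V >> 8));
  }
  return 0;
}

Routing the 128-bit cases through these target-independent nodes instead of X86ISD::VZEXT/VSEXT is also what lets the generic known-bits and sign-bits logic see through them, which the known-bits test updates later in the patch rely on.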
if (Subtarget.hasSSE41()) { - SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); + SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Sext; } @@ -18763,11 +18787,11 @@ ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { ShAmt = ShAmt.getOperand(0); ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt); - ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); + ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); - ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt); + ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else { SmallVector ShOps = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; @@ -21058,8 +21082,8 @@ // Extract the lo parts and sign extend to i16 SDValue ALo, BLo; if (Subtarget.hasSSE41()) { - ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A); - BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B); + ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT); + BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT); } else { const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7}; @@ -21078,8 +21102,8 @@ -1, -1, -1, -1, -1, -1, -1, -1}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi); - BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi); + AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT); + BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT); } else { const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15}; @@ -21240,8 +21264,8 @@ DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask)); } - SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A); - SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B); + SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG); + SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG); SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB); SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul, DAG.getConstant(8, dl, MVT::v16i16)); @@ -21257,8 +21281,8 @@ // Extract the lo parts and zero/sign extend to i16. SDValue ALo, BLo; if (Subtarget.hasSSE41()) { - ALo = DAG.getNode(ExSSE41, dl, ExVT, A); - BLo = DAG.getNode(ExSSE41, dl, ExVT, B); + ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG); + BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG); } else { const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7}; @@ -21277,8 +21301,8 @@ -1, -1, -1, -1, -1, -1, -1, -1}; AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask); BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask); - AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi); - BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi); + AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG); + BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG); } else { const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15}; @@ -26455,7 +26479,7 @@ unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); - // Match against a VZEXT instruction. + // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction. // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). 
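The byte-vector multiply-high lowering a few hunks above follows a simple identity: widen each half to 16-bit lanes (sign-extending in-reg for mulhs, zero-extending for mulhu), multiply, and keep bits [15:8] of every product; the patch only changes how the widening node is built. A scalar sketch of that identity, for illustration only; nothing here is taken from LLVM and the helper names are invented for the example.

// Scalar model of the widen-multiply-shift identity behind the v16i8
// multiply-high lowering (illustration only).
#include <cstdint>
#include <cstdio>

static uint8_t mulhs8(int8_t A, int8_t B) {
  int16_t Wide = static_cast<int16_t>(A) * static_cast<int16_t>(B); // sign extend + MUL
  return static_cast<uint8_t>(static_cast<uint16_t>(Wide) >> 8);    // SRL 8 + truncate
}

static uint8_t mulhu8(uint8_t A, uint8_t B) {
  uint16_t Wide = static_cast<uint16_t>(A) * static_cast<uint16_t>(B); // zero extend + MUL
  return static_cast<uint8_t>(Wide >> 8);
}

int main() {
  std::printf("mulhs8(-100, 77) = 0x%02x\n", unsigned(mulhs8(-100, 77)));
  std::printf("mulhu8(200, 150) = 0x%02x\n", unsigned(mulhu8(200, 150)));
  return 0;
}

The signed and unsigned flavours differ only in the extension step, which is why the lowering can share the shuffle/pack structure and just pick ExSSE41 accordingly.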
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { @@ -26474,7 +26498,8 @@ V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); DstVT = MVT::getIntegerVT(Scale * MaskEltSize); DstVT = MVT::getVectorVT(DstVT, NumDstElts); - Shuffle = X86ISD::VZEXT; + Shuffle = (SrcVT != MaskVT ? X86ISD::VZEXT + : ISD::ZERO_EXTEND_VECTOR_INREG); return true; } } @@ -32171,7 +32196,7 @@ Mld->getBasePtr(), NewMask, WideSrc0, Mld->getMemoryVT(), Mld->getMemOperand(), ISD::NON_EXTLOAD); - SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); + SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG); return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); } Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -7481,11 +7481,11 @@ } multiclass avx512_extend_BW opc, string OpcodeStr, - SDPatternOperator OpNode, + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,PatFrag LdFrag = !cast(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasBWI] in { defm Z128: avx512_extend_common, + v16i8x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128; defm Z256: avx512_extend_common opc, string OpcodeStr, - SDPatternOperator OpNode, + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,PatFrag LdFrag = !cast(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_extend_common, + v16i8x_info, i32mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128; defm Z256: avx512_extend_common opc, string OpcodeStr, - SDPatternOperator OpNode, + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,PatFrag LdFrag = !cast(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_extend_common, + v16i8x_info, i16mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128; defm Z256: avx512_extend_common opc, string OpcodeStr, - SDPatternOperator OpNode, + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,PatFrag LdFrag = !cast(ExtTy#"extloadvi16")> { let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_extend_common, + v8i16x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128; defm Z256: avx512_extend_common opc, string OpcodeStr, - SDPatternOperator OpNode, + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,PatFrag LdFrag = !cast(ExtTy#"extloadvi16")> { let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_extend_common, + v8i16x_info, i32mem, LdFrag, InVecNode>, EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128; defm Z256: avx512_extend_common opc, string OpcodeStr, - SDPatternOperator OpNode, + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,PatFrag LdFrag = !cast(ExtTy#"extloadvi32")> { let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_extend_common, + v4i32x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128; defm Z256: avx512_extend_common; -defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, "z">; -defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, "z">; -defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, "z">; -defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, "z">; -defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, "z">; - -defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, "s">; -defm VPMOVSXBD: 
avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, "s">; -defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, "s">; -defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, "s">; -defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, "s">; -defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, "s">; +defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z">; +defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z">; +defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z">; +defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z">; +defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z">; +defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z">; + +defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s">; +defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s">; +defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s">; +defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s">; +defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s">; +defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s">; // EXTLOAD patterns, implemented using vpmovz multiclass avx512_ext_lowering; } -multiclass AVX512_pmovx_patterns { +multiclass AVX512_pmovx_patterns { // 128-bit patterns let Predicates = [HasVLX, HasBWI] in { - def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v8i16 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BWZ128rm) addr:$src)>; - def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v8i16 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))), (!cast(OpcPrefix#BWZ128rm) addr:$src)>; } let Predicates = [HasVLX] in { - def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + def : Pat<(v4i32 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), (!cast(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))), (!cast(OpcPrefix#BDZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))), + def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))), (!cast(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + def : Pat<(v2i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), 
(!cast(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))), (!cast(OpcPrefix#BQZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v4i32 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v4i32 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))), (!cast(OpcPrefix#WDZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))), + def : Pat<(v2i64 (InVecOp (v8i16 (vzmovl_v4i32 addr:$src)))), (!cast(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))), (!cast(OpcPrefix#WQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), (!cast(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (v4i32 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#DQZ128rm) addr:$src)>; - def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(v2i64 (InVecOp (bc_v4i32 (loadv2i64 addr:$src)))), (!cast(OpcPrefix#DQZ128rm) addr:$src)>; } // 256-bit patterns @@ -7791,8 +7791,8 @@ } } -defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, extloadi32i16>; -defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, loadi16_anyext>; +defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec, extloadi32i16>; +defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec, loadi16_anyext>; //===----------------------------------------------------------------------===// // GATHER - SCATTER Operations Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ 
lib/Target/X86/X86InstrSSE.td @@ -5963,12 +5963,12 @@ } } -defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>; -defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>; +defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec, extloadi32i16>; +defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec, loadi16_anyext>; let Predicates = [UseSSE41] in { - defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>; - defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>; + defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec, extloadi32i16>; + defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec, loadi16_anyext>; } //===----------------------------------------------------------------------===// Index: test/CodeGen/X86/2011-10-19-widen_vselect.ll =================================================================== --- test/CodeGen/X86/2011-10-19-widen_vselect.ll +++ test/CodeGen/X86/2011-10-19-widen_vselect.ll @@ -27,8 +27,6 @@ ; X32: # BB#0: # %entry ; X32-NEXT: movaps %xmm0, %xmm2 ; X32-NEXT: cmpordps %xmm0, %xmm0 -; X32-NEXT: pmovsxdq %xmm0, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X32-NEXT: pslld $31, %xmm0 ; X32-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; X32-NEXT: extractps $1, %xmm1, (%eax) @@ -39,8 +37,6 @@ ; X64: # BB#0: # %entry ; X64-NEXT: movaps %xmm0, %xmm2 ; X64-NEXT: cmpordps %xmm0, %xmm0 -; X64-NEXT: pmovsxdq %xmm0, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-NEXT: pslld $31, %xmm0 ; X64-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; X64-NEXT: movlps %xmm1, (%rax) @@ -82,8 +78,6 @@ ; X32-NEXT: cvtdq2ps %xmm0, %xmm1 ; X32-NEXT: xorps %xmm0, %xmm0 ; X32-NEXT: cmpltps %xmm2, %xmm0 -; X32-NEXT: pmovsxdq %xmm0, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X32-NEXT: pslld $31, %xmm0 ; X32-NEXT: movaps {{.*#+}} xmm3 = <1,1,u,u> ; X32-NEXT: addps %xmm1, %xmm3 Index: test/CodeGen/X86/2011-10-21-widen-cmp.ll =================================================================== --- test/CodeGen/X86/2011-10-21-widen-cmp.ll +++ test/CodeGen/X86/2011-10-21-widen-cmp.ll @@ -9,8 +9,6 @@ ; CHECK: # BB#0: # %entry ; CHECK-NEXT: movaps %xmm0, %xmm2 ; CHECK-NEXT: cmpordps %xmm0, %xmm0 -; CHECK-NEXT: pmovsxdq %xmm0, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: pslld $31, %xmm0 ; CHECK-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; CHECK-NEXT: movlps %xmm1, (%rax) Index: test/CodeGen/X86/combine-shl.ll =================================================================== --- test/CodeGen/X86/combine-shl.ll +++ test/CodeGen/X86/combine-shl.ll @@ -243,11 +243,11 @@ define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) { ; SSE-LABEL: combine_vec_shl_zext_lshr0: ; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_shl_zext_lshr0: @@ -270,15 +270,15 @@ ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: psrlw $4, %xmm0 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; 
SSE-NEXT: psrlw $2, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2],xmm0[3,4],xmm2[5,6],xmm0[7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrlw $1, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrlw $2, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3,4],xmm1[5,6],xmm0[7] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrlw $1, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0 ; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 ; SSE-NEXT: retq Index: test/CodeGen/X86/known-bits-vector.ll =================================================================== --- test/CodeGen/X86/known-bits-vector.ll +++ test/CodeGen/X86/known-bits-vector.ll @@ -83,15 +83,15 @@ ; X32-LABEL: knownbits_mask_shuffle_sext: ; X32: # BB#0: ; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X32-NEXT: retl ; ; X64-LABEL: knownbits_mask_shuffle_sext: ; X64: # BB#0: ; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-NEXT: retq %1 = and <8 x i16> %a0, %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> @@ -103,15 +103,15 @@ ; X32-LABEL: knownbits_mask_shuffle_shuffle_sext: ; X32: # BB#0: ; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X32-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X32-NEXT: retl ; ; X64-LABEL: knownbits_mask_shuffle_shuffle_sext: ; X64: # BB#0: ; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; X64-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-NEXT: retq %1 = and <8 x i16> %a0, %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> Index: test/CodeGen/X86/pmul.ll =================================================================== --- test/CodeGen/X86/pmul.ll +++ test/CodeGen/X86/pmul.ll @@ -1157,15 +1157,15 @@ ; ; SSE41-LABEL: mul_v4i64_zero_upper: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = 
xmm2[0],zero,xmm2[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE41-NEXT: pmuludq %xmm0, %xmm1 -; SSE41-NEXT: pmuludq %xmm4, %xmm2 -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] -; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pmuludq %xmm3, %xmm0 +; SSE41-NEXT: pmuludq %xmm2, %xmm4 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: retq ; ; AVX2-LABEL: mul_v4i64_zero_upper: @@ -1219,21 +1219,21 @@ ; ; SSE41-LABEL: mul_v4i64_zero_upper_left: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pmuludq %xmm2, %xmm3 -; SSE41-NEXT: psrlq $32, %xmm2 -; SSE41-NEXT: pmuludq %xmm0, %xmm2 -; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: paddq %xmm3, %xmm2 ; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: pmuludq %xmm1, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm1 ; SSE41-NEXT: pmuludq %xmm4, %xmm1 ; SSE41-NEXT: psllq $32, %xmm1 ; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: pmuludq %xmm2, %xmm1 +; SSE41-NEXT: psrlq $32, %xmm2 +; SSE41-NEXT: pmuludq %xmm3, %xmm2 +; SSE41-NEXT: psllq $32, %xmm2 +; SSE41-NEXT: paddq %xmm1, %xmm2 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] ; SSE41-NEXT: retq ; @@ -1288,17 +1288,16 @@ ; ; SSE41-LABEL: mul_v4i64_zero_lower: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: psrlq $32, %xmm1 +; SSE41-NEXT: pmuludq %xmm1, %xmm0 +; SSE41-NEXT: psllq $32, %xmm0 ; SSE41-NEXT: psrlq $32, %xmm2 -; SSE41-NEXT: pmuludq %xmm0, %xmm2 +; SSE41-NEXT: pmuludq %xmm3, %xmm2 ; SSE41-NEXT: psllq $32, %xmm2 -; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: pmuludq %xmm1, %xmm3 -; SSE41-NEXT: psllq $32, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3] -; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] ; SSE41-NEXT: retq ; ; AVX2-LABEL: mul_v4i64_zero_lower: @@ -1358,23 +1357,24 @@ ; ; SSE41-LABEL: mul_v8i64_zero_upper: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: pxor %xmm6, %xmm6 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: punpckhdq 
{{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE41-NEXT: pmuludq %xmm1, %xmm3 -; SSE41-NEXT: pmuludq %xmm0, %xmm2 -; SSE41-NEXT: pmuludq %xmm7, %xmm5 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero +; SSE41-NEXT: pmuludq %xmm7, %xmm1 +; SSE41-NEXT: pmuludq %xmm6, %xmm2 +; SSE41-NEXT: pmuludq %xmm5, %xmm0 ; SSE41-NEXT: pmuludq %xmm8, %xmm4 -; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3] -; SSE41-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3] -; SSE41-NEXT: movaps %xmm4, %xmm0 -; SSE41-NEXT: movaps %xmm5, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] ; SSE41-NEXT: retq ; ; AVX2-LABEL: mul_v8i64_zero_upper: Index: test/CodeGen/X86/vec_cast2.ll =================================================================== --- test/CodeGen/X86/vec_cast2.ll +++ test/CodeGen/X86/vec_cast2.ll @@ -48,10 +48,10 @@ ; CHECK-LABEL: foo2_8: ; CHECK: ## BB#0: ; CHECK-NEXT: vpand LCPI2_0, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl ; Index: test/CodeGen/X86/vec_int_to_fp.ll =================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -2032,10 +2032,10 @@ ; ; AVX1-LABEL: uitofp_8i16_to_4f32: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX1-NEXT: vzeroupper @@ -2456,10 +2456,10 @@ ; ; AVX1-LABEL: uitofp_8i16_to_8f32: ; AVX1: # BB#0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, 
%ymm0 ; AVX1-NEXT: retq ; @@ -3021,8 +3021,8 @@ ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512VL-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; @@ -3037,8 +3037,8 @@ ; AVX512VLDQ: # BB#0: ; AVX512VLDQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero ; AVX512VLDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLDQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] +; AVX512VLDQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512VLDQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX512VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq %ld = load <2 x i16>, <2 x i16> *%a @@ -3076,7 +3076,8 @@ ; AVX512VL-LABEL: uitofp_load_2i8_to_2f64: ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[u],zero,zero,zero,xmm0[u],zero,zero,zero +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512VL-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; @@ -3091,7 +3092,8 @@ ; AVX512VLDQ-LABEL: uitofp_load_2i8_to_2f64: ; AVX512VLDQ: # BB#0: ; AVX512VLDQ-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLDQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[u],zero,zero,zero,xmm0[u],zero,zero,zero +; AVX512VLDQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VLDQ-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX512VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq %ld = load <2 x i8>, <2 x i8> *%a Index: test/CodeGen/X86/vector-zext.ll =================================================================== --- test/CodeGen/X86/vector-zext.ll +++ test/CodeGen/X86/vector-zext.ll @@ -55,18 +55,18 @@ ; ; SSE41-LABEL: zext_16i8_to_16i16: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_16i8_to_16i16: ; AVX1: # BB#0: # 
%entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_16i8_to_16i16: @@ -110,25 +110,27 @@ ; ; SSE41-LABEL: zext_32i8_to_32i16: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_32i8_to_32i16: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 ; AVX1-NEXT: retq ; @@ -515,18 +517,18 @@ ; ; SSE41-LABEL: zext_8i16_to_8i32: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_8i16_to_8i32: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_8i16_to_8i32: @@ -570,25 +572,27 @@ ; ; SSE41-LABEL: zext_16i16_to_16i32: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_16i16_to_16i32: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 ; AVX1-NEXT: retq ; @@ -812,18 +816,18 @@ ; ; SSE41-LABEL: zext_4i32_to_4i64: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_4i32_to_4i64: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_4i32_to_4i64: @@ -867,25 +871,27 @@ ; ; SSE41-LABEL: zext_8i32_to_8i64: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: movdqa %xmm5, %xmm0 +; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_8i32_to_8i64: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 ; AVX1-NEXT: retq ; @@ -1523,20 +1529,20 @@ ; ; SSE41-LABEL: zext_8i8_to_8i32: ; SSE41: # BB#0: # %entry -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; 
SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: zext_8i8_to_8i32: ; AVX1: # BB#0: # %entry ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_8i8_to_8i32:
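
Taken together, the test updates show the intended codegen effect: with the extend-in-reg nodes legal on SSE4.1, zero-extending the upper input elements no longer needs a zeroed register and a punpckh*; the upper elements are moved down with a pshufd and then extended with pmovzx*, as in zext_8i16_to_8i32 above. Roughly, in intrinsics terms, the two equivalent sequences look as follows; this is an illustration of the CHECK lines, not compiler output, and the function names are invented for the example.

// Build with SSE4.1 enabled (e.g. -msse4.1). Both helpers zero-extend the
// upper four i16 lanes of V to i32.
#include <smmintrin.h> // SSE4.1
#include <cstdint>
#include <cstdio>

// Old sequence: interleave with an explicit zero register.
static inline __m128i zext_high_old(__m128i V) {
  return _mm_unpackhi_epi16(V, _mm_setzero_si128()); // pxor + punpckhwd
}

// New sequence: move the high 64 bits down, then use the in-reg extend.
static inline __m128i zext_high_new(__m128i V) {
  __m128i Hi = _mm_shuffle_epi32(V, _MM_SHUFFLE(1, 0, 3, 2)); // pshufd [2,3,0,1]
  return _mm_cvtepu16_epi32(Hi);                              // pmovzxwd
}

int main() {
  __m128i V = _mm_set_epi16(-1, -2, -3, -4, -5, -6, -7, -8);
  alignas(16) uint32_t A[4], B[4];
  _mm_store_si128(reinterpret_cast<__m128i *>(A), zext_high_old(V));
  _mm_store_si128(reinterpret_cast<__m128i *>(B), zext_high_new(V));
  for (int I = 0; I != 4; ++I)
    std::printf("%08x %08x %s\n", A[I], B[I], A[I] == B[I] ? "ok" : "MISMATCH");
  return 0;
}

Both forms produce the same four 32-bit lanes; the second is what the updated CHECK lines expect, and it drops the pxor that previously had to keep a zero vector live across the sequence.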