Index: include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- include/llvm/CodeGen/ISDOpcodes.h
+++ include/llvm/CodeGen/ISDOpcodes.h
@@ -478,31 +478,33 @@
   /// in-register any-extension of the low lanes of an integer vector. The
   /// result type must have fewer elements than the operand type, and those
   /// elements must be larger integer types such that the total size of the
-  /// operand type and the result type match. Each of the low operand
-  /// elements is any-extended into the corresponding, wider result
-  /// elements with the high bits becoming undef.
+  /// operand type is less than or equal to the size of the result type. Each
+  /// of the low operand elements is any-extended into the corresponding,
+  /// wider result elements with the high bits becoming undef.
+  /// NOTE: The type legalizer prefers to make the operand and result size
+  /// the same to allow expansion to shuffle vector during op legalization.
   ANY_EXTEND_VECTOR_INREG,

   /// SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an
   /// in-register sign-extension of the low lanes of an integer vector. The
   /// result type must have fewer elements than the operand type, and those
   /// elements must be larger integer types such that the total size of the
-  /// operand type and the result type match. Each of the low operand
-  /// elements is sign-extended into the corresponding, wider result
-  /// elements.
-  // FIXME: The SIGN_EXTEND_INREG node isn't specifically limited to
-  // scalars, but it also doesn't handle vectors well. Either it should be
-  // restricted to scalars or this node (and its handling) should be merged
-  // into it.
+  /// operand type is less than or equal to the size of the result type. Each
+  /// of the low operand elements is sign-extended into the corresponding,
+  /// wider result elements.
+  /// NOTE: The type legalizer prefers to make the operand and result size
+  /// the same to allow expansion to shuffle vector during op legalization.
   SIGN_EXTEND_VECTOR_INREG,

   /// ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an
   /// in-register zero-extension of the low lanes of an integer vector. The
   /// result type must have fewer elements than the operand type, and those
   /// elements must be larger integer types such that the total size of the
-  /// operand type and the result type match. Each of the low operand
-  /// elements is zero-extended into the corresponding, wider result
-  /// elements.
+  /// operand type is less than or equal to the size of the result type. Each
+  /// of the low operand elements is zero-extended into the corresponding,
+  /// wider result elements.
+  /// NOTE: The type legalizer prefers to make the operand and result size
+  /// the same to allow expansion to shuffle vector during op legalization.
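+  /// For example, (v2i64 (zero_extend_vector_inreg (v4i32 X))) zero-extends
+  /// elements 0 and 1 of X into the two i64 result elements and ignores
+  /// elements 2 and 3.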
   ZERO_EXTEND_VECTOR_INREG,

   /// FP_TO_[US]INT - Convert a floating point value to a signed or unsigned
Index: include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- include/llvm/Target/TargetSelectionDAG.td
+++ include/llvm/Target/TargetSelectionDAG.td
@@ -162,7 +162,7 @@
 ]>;
 def SDTExtInvec : SDTypeProfile<1, 1, [   // sext_invec
   SDTCisInt<0>, SDTCisVec<0>, SDTCisInt<1>, SDTCisVec<1>,
-  SDTCisOpSmallerThanOp<1, 0>, SDTCisSameSizeAs<0,1>
+  SDTCisOpSmallerThanOp<1, 0>
 ]>;
 def SDTSetCC : SDTypeProfile<1, 3, [   // setcc
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4168,9 +4168,8 @@
   case ISD::ZERO_EXTEND_VECTOR_INREG:
   case ISD::SIGN_EXTEND_VECTOR_INREG:
     assert(VT.isVector() && "This DAG node is restricted to vector types.");
-    assert(VT.getSizeInBits() == Operand.getValueSizeInBits() &&
-           "The sizes of the input and result must match in order to perform the "
-           "extend in-register.");
+    assert(Operand.getValueType().bitsLE(VT) &&
+           "The input must be the same size or smaller than the result.");
     assert(VT.getVectorNumElements() <
            Operand.getValueType().getVectorNumElements() &&
            "The destination vector type must have fewer lanes than the input.");
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -295,11 +295,6 @@
       // Vector move to low scalar and zero higher vector elements.
       VZEXT_MOVL,

-      // Vector integer zero-extend.
-      VZEXT,
-      // Vector integer signed-extend.
-      VSEXT,
-
       // Vector integer truncate.
       VTRUNC,
       // Vector integer truncate with unsigned/signed saturation.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1335,6 +1335,11 @@
     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64,  Custom);
     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

+    for (auto VT : {MVT::v16i32, MVT::v8i64}) {
+      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
+      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
+    }
+
     // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
     setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
@@ -1555,6 +1560,7 @@
     setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
+    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);

     setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
@@ -5456,15 +5462,9 @@
   return DAG.getBitcast(VT, Vec);
 }

-static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
+static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In,
                               SelectionDAG &DAG) {
   EVT InVT = In.getValueType();
-  assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
-
-  if (VT.is128BitVector() && InVT.is128BitVector())
-    return DAG.getNode(X86ISD::VSEXT == Opc ? ISD::SIGN_EXTEND_VECTOR_INREG
-                                            : ISD::ZERO_EXTEND_VECTOR_INREG,
-                       DL, VT, In);

   // For 256-bit vectors, we only need the lower (128-bit) input half.
   // For 512-bit vectors, we only need the lower input half or quarter.
@@ -5474,7 +5474,13 @@
                           std::max(128, (int)VT.getSizeInBits() / Scale));
   }

-  return DAG.getNode(Opc, DL, VT, In);
+  if (VT.getVectorNumElements() == In.getValueType().getVectorNumElements())
+    return DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+                       DL, VT, In);
+
+  return DAG.getNode(Signed ? ISD::SIGN_EXTEND_VECTOR_INREG
+                            : ISD::ZERO_EXTEND_VECTOR_INREG,
+                     DL, VT, In);
 }

 /// Returns a vector_shuffle node for an unpackl operation.
@@ -6529,7 +6535,7 @@
     return true;
   }
   case ISD::ZERO_EXTEND_VECTOR_INREG:
-  case X86ISD::VZEXT: {
+  case ISD::ZERO_EXTEND: {
     // TODO - add support for VPMOVZX with smaller input vector types.
     SDValue Src = N.getOperand(0);
     MVT SrcVT = Src.getSimpleValueType();
@@ -10779,7 +10785,7 @@
     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                  NumElements / Scale);
     InputV = ShuffleOffset(InputV);
-    InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
+    InputV = getExtendInVec(/*Signed*/false, DL, ExtVT, InputV, DAG);
     return DAG.getBitcast(VT, InputV);
   }
@@ -17477,7 +17483,7 @@
          "Unexpected element type");

   if (Subtarget.hasInt256())
-    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
+    return Op;

   // Optimize vectors in AVX mode:
   //
@@ -19788,7 +19794,6 @@
   SDValue In = Op->getOperand(0);
   MVT VT = Op->getSimpleValueType(0);
   MVT InVT = In.getSimpleValueType();
-  assert(VT.getSizeInBits() == InVT.getSizeInBits());

   MVT SVT = VT.getVectorElementType();
   MVT InSVT = InVT.getVectorElementType();
@@ -19809,7 +19814,7 @@
   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
   // For 512-bit vectors, we need 128-bits or 256-bits.
-  if (VT.getSizeInBits() > 128) {
+  if (InVT.getSizeInBits() > 128) {
     // Input needs to be at least the same number of elements as output, and
     // at least 128-bits.
     int InSize = InSVT.getSizeInBits() * NumElts;
@@ -19821,8 +19826,15 @@
   // need to be handled here for 256/512-bit results.
   if (Subtarget.hasInt256()) {
     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
+
+    if (In.getValueType().getVectorNumElements() != NumElts)
+      return DAG.getNode(Op.getOpcode(), dl, VT, In);
+
+    // FIXME: Apparently we create inreg operations that could be regular
+    // extends.
     unsigned ExtOpc =
-        Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? X86ISD::VSEXT : X86ISD::VZEXT;
+        Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
+                                             : ISD::ZERO_EXTEND;
     return DAG.getNode(ExtOpc, dl, VT, In);
   }
@@ -19904,7 +19916,7 @@
          "Unexpected element type");

   if (Subtarget.hasInt256())
-    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+    return Op;

   // Optimize vectors in AVX mode
   // Sign extend  v8i16 to v8i32 and
@@ -19976,7 +19988,7 @@
 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
 // may emit an illegal shuffle but the expansion is still better than scalar
-// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
+// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
 // we'll emit a shuffle and a arithmetic shift.
 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
 // TODO: It is possible to support ZExt by zeroing the undef values during
@@ -20161,9 +20173,9 @@
   unsigned SizeRatio = RegSz / MemSz;

   if (Ext == ISD::SEXTLOAD) {
-    // If we have SSE4.1, we can directly emit a VSEXT node.
+    // If we have SSE4.1, we can directly emit a sext/sext_invec node.
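+    // getExtendInVec returns ISD::SIGN_EXTEND when the source and result
+    // element counts already match, and SIGN_EXTEND_VECTOR_INREG otherwise.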
     if (Subtarget.hasSSE41()) {
-      SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
+      SDValue Sext = getExtendInVec(/*Signed*/true, dl, RegVT, SlicedVec, DAG);
       return DAG.getMergeValues({Sext, TF}, dl);
     }
@@ -20179,7 +20191,7 @@

   if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
       MemVT == MVT::v8i8) {
-    SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
+    SDValue Sext = getExtendInVec(/*Signed*/false, dl, RegVT, SlicedVec, DAG);
     return DAG.getMergeValues({Sext, TF}, dl);
   }
@@ -26571,8 +26583,6 @@
   case X86ISD::LDEC:               return "X86ISD::LDEC";
   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
-  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
-  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
   case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
   case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
@@ -29528,23 +29538,6 @@
     Known = Known.trunc(BitWidth);
     break;
   }
-  case X86ISD::VZEXT: {
-    // TODO: Add DemandedElts support.
-    SDValue N0 = Op.getOperand(0);
-    unsigned NumElts = VT.getVectorNumElements();
-
-    EVT SrcVT = N0.getValueType();
-    unsigned InNumElts = SrcVT.getVectorNumElements();
-    unsigned InBitWidth = SrcVT.getScalarSizeInBits();
-    assert(InNumElts >= NumElts && "Illegal VZEXT input");
-
-    Known = KnownBits(InBitWidth);
-    APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
-    DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
-    Known = Known.zext(BitWidth);
-    Known.Zero.setBitsFrom(InBitWidth);
-    break;
-  }
   case X86ISD::CMOV: {
     DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
     // If we don't know any bits, early out.
@@ -29622,14 +29615,6 @@
     // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
     return VTBits;

-  case X86ISD::VSEXT: {
-    // TODO: Add DemandedElts support.
-    SDValue Src = Op.getOperand(0);
-    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
-    Tmp += VTBits - Src.getScalarValueSizeInBits();
-    return Tmp;
-  }
-
   case X86ISD::VTRUNC: {
     // TODO: Add DemandedElts support.
     SDValue Src = Op.getOperand(0);
@@ -29753,10 +29738,12 @@
                                             MVT::getIntegerVT(MaskEltSize);
     SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);

-    if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
+    if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits())
       V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
-      Shuffle = unsigned(X86ISD::VZEXT);
-    } else
+
+    if (SrcVT.getVectorNumElements() == NumDstElts)
+      Shuffle = unsigned(ISD::ZERO_EXTEND);
+    else
       Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);

     DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
@@ -31939,18 +31926,6 @@
     }
     break;
   }
-  case X86ISD::VSEXT: {
-    SDValue Src = Op.getOperand(0);
-    unsigned InBits = Src.getScalarValueSizeInBits();
-
-    // If none of the top bits are demanded, convert this into an any_extend.
-    if (OriginalDemandedBits.getActiveBits() <= InBits)
-      return TLO.CombineTo(Op,
-                           TLO.DAG.getNode(X86ISD::VZEXT, SDLoc(Op),
-                                           Op.getValueType(), Src));
-
-    break;
-  }
   }

   return TargetLowering::SimplifyDemandedBitsForTargetNode(
@@ -36628,7 +36603,7 @@
                             Mld->getBasePtr(), NewMask, WidePassThru,
                             Mld->getMemoryVT(), Mld->getMemOperand(),
                             ISD::NON_EXTLOAD);
-  SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
+  SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG);
   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
 }
@@ -40075,7 +40050,7 @@
   MVT OpVT = Op.getSimpleValueType();
   MVT OpEltVT = OpVT.getVectorElementType();
   unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
-  unsigned InputBits = OpEltSizeInBits * NumElts;
+  //unsigned InputBits = OpEltSizeInBits * NumElts;

   // Perform any constant folding.
   // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
@@ -40084,69 +40059,18 @@
   if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
     APInt Undefs(NumElts, 0);
     SmallVector Vals(NumElts, APInt(EltSizeInBits, 0));
-    bool IsZEXT =
-        (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
+    bool IsSEXT = (Opcode == ISD::SIGN_EXTEND_VECTOR_INREG);
     for (unsigned i = 0; i != NumElts; ++i) {
       if (UndefElts[i]) {
         Undefs.setBit(i);
         continue;
       }
-      Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
-                       : EltBits[i].sextOrTrunc(EltSizeInBits);
+      Vals[i] = IsSEXT ? EltBits[i].sextOrTrunc(EltSizeInBits)
+                       : EltBits[i].zextOrTrunc(EltSizeInBits);
     }
     return getConstVector(Vals, Undefs, VT, DAG, DL);
   }

-  // (vzext (bitcast (vzext (x)) -> (vzext x)
-  // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
-  SDValue V = peekThroughBitcasts(Op);
-  if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
-    MVT InnerVT = V.getSimpleValueType();
-    MVT InnerEltVT = InnerVT.getVectorElementType();
-
-    // If the element sizes match exactly, we can just do one larger vzext. This
-    // is always an exact type match as vzext operates on integer types.
-    if (OpEltVT == InnerEltVT) {
-      assert(OpVT == InnerVT && "Types must match for vzext!");
-      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
-    }
-
-    // The only other way we can combine them is if only a single element of the
-    // inner vzext is used in the input to the outer vzext.
-    if (InnerEltVT.getSizeInBits() < InputBits)
-      return SDValue();
-
-    // In this case, the inner vzext is completely dead because we're going to
-    // only look at bits inside of the low element. Just do the outer vzext on
-    // a bitcast of the input to the inner.
-    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
-  }
-
-  // Check if we can bypass extracting and re-inserting an element of an input
-  // vector. Essentially:
-  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
-  // TODO: Add X86ISD::VSEXT support
-  if (Opcode == X86ISD::VZEXT &&
-      V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
-      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
-    SDValue ExtractedV = V.getOperand(0);
-    SDValue OrigV = ExtractedV.getOperand(0);
-    if (isNullConstant(ExtractedV.getOperand(1))) {
-      MVT OrigVT = OrigV.getSimpleValueType();
-      // Extract a subvector if necessary...
-      if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
-        int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
-        OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
-                                  OrigVT.getVectorNumElements() / Ratio);
-        OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
-                            DAG.getIntPtrConstant(0, DL));
-      }
-      Op = DAG.getBitcast(OpVT, OrigV);
-      return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
-    }
-  }
-
   return SDValue();
 }
@@ -40400,13 +40324,20 @@
       return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0));
     }
   }
-  if ((InOpcode == X86ISD::VZEXT || InOpcode == X86ISD::VSEXT) &&
+  if ((InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::SIGN_EXTEND) &&
       OpVT.is128BitVector() &&
       InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
-    unsigned ExtOp = InOpcode == X86ISD::VZEXT ? ISD::ZERO_EXTEND_VECTOR_INREG
-                                               : ISD::SIGN_EXTEND_VECTOR_INREG;
+    unsigned ExtOp =
+        InOpcode == ISD::ZERO_EXTEND ? ISD::ZERO_EXTEND_VECTOR_INREG
+                                     : ISD::SIGN_EXTEND_VECTOR_INREG;
     return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0));
   }
+  if ((InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
+       InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
+      OpVT.is128BitVector() &&
+      InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
+    return DAG.getNode(InOpcode, SDLoc(N), OpVT, InVec.getOperand(0));
+  }
   if (InOpcode == ISD::BITCAST) {
     // TODO - do this for target shuffles in general.
     SDValue InVecBC = peekThroughOneUseBitcasts(InVec);
@@ -40531,8 +40462,7 @@
     return combineVectorShiftImm(N, DAG, DCI, Subtarget);
   case ISD::SIGN_EXTEND_VECTOR_INREG:
   case ISD::ZERO_EXTEND_VECTOR_INREG:
-  case X86ISD::VSEXT:
-  case X86ISD::VZEXT:       return combineVSZext(N, DAG, DCI, Subtarget);
+                            return combineVSZext(N, DAG, DCI, Subtarget);
   case X86ISD::PINSRB:
   case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -9528,7 +9528,7 @@
                                      EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, VEX_WIG;
   defm Z256: WriteShuffle256_common,
+                                     v16i8x_info, i64mem, LdFrag, InVecNode>,
                                      EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256, VEX_WIG;
 }
 let Predicates = [HasAVX512] in {
@@ -9547,12 +9547,12 @@
                                      EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, VEX_WIG;
   defm Z256: WriteShuffle256_common,
+                                     v16i8x_info, i32mem, LdFrag, InVecNode>,
                                      EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, VEX_WIG;
 }
 let Predicates = [HasAVX512] in {
   defm Z : WriteShuffle256_common,
+                                   v16i8x_info, i64mem, LdFrag, InVecNode>,
                                    EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, VEX_WIG;
 }
 }
@@ -9585,7 +9585,7 @@
                                      EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, VEX_WIG;
   defm Z256: WriteShuffle256_common,
+                                     v8i16x_info, i64mem, LdFrag, InVecNode>,
                                      EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, VEX_WIG;
 }
 let Predicates = [HasAVX512] in {
@@ -9615,23 +9615,107 @@
 }
 }

-defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z", WriteShuffle256>;
-defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z", WriteShuffle256>;
-defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z", WriteShuffle256>;
-defm VPMOVZXWD : WriteShuffle256_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z", WriteShuffle256>;
-defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z", WriteShuffle256>;
-defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXWD : WriteShuffle256_WD<0x33, "vpmovzxwd", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", zext, zext_invec, "z", WriteShuffle256>;
+defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", zext, zext_invec, "z", WriteShuffle256>;
+
+defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", sext, sext_invec, "s", WriteShuffle256>;
+defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", sext, sext_invec, "s", WriteShuffle256>;
+
+
+// Patterns that we also need any extend versions of. aext_vector_inreg
+// is currently legalized to zext_vector_inreg.
+multiclass AVX512_pmovx_patterns_base<string OpcPrefix, SDNode ExtOp> {
+  // 256-bit patterns
+  let Predicates = [HasVLX, HasBWI] in {
+  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
+            (!cast<Instruction>(OpcPrefix#BWZ256rm) addr:$src)>;
+  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+            (!cast<Instruction>(OpcPrefix#BWZ256rm) addr:$src)>;
+  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<Instruction>(OpcPrefix#BWZ256rm) addr:$src)>;
+  }
+
+  let Predicates = [HasVLX] in {
+  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
+            (!cast<Instruction>(OpcPrefix#WDZ256rm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+            (!cast<Instruction>(OpcPrefix#WDZ256rm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+            (!cast<Instruction>(OpcPrefix#WDZ256rm) addr:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
+            (!cast<Instruction>(OpcPrefix#DQZ256rm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+            (!cast<Instruction>(OpcPrefix#DQZ256rm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+            (!cast<Instruction>(OpcPrefix#DQZ256rm) addr:$src)>;
+  }
+
+  // 512-bit patterns
+  let Predicates = [HasBWI] in {
+  def : Pat<(v32i16 (ExtOp (loadv32i8 addr:$src))),
+            (!cast<Instruction>(OpcPrefix#BWZrm) addr:$src)>;
+  }
+  let Predicates = [HasAVX512] in {
+  def : Pat<(v16i32 (ExtOp (loadv16i8 addr:$src))),
+            (!cast<Instruction>(OpcPrefix#BDZrm) addr:$src)>;
+  def : Pat<(v16i32 (ExtOp (loadv16i16 addr:$src))),
+            (!cast<Instruction>(OpcPrefix#WDZrm) addr:$src)>;
+
+  def : Pat<(v8i64 (ExtOp (loadv8i16 addr:$src))),
+            (!cast<Instruction>(OpcPrefix#WQZrm) addr:$src)>;
+
+  def : Pat<(v8i64 (ExtOp (loadv8i32 addr:$src))),
+            (!cast<Instruction>(OpcPrefix#DQZrm) addr:$src)>;
+  }
+}
+
+multiclass AVX512_pmovx_patterns_aext<string OpcPrefix, SDNode ExtOp> :
+  AVX512_pmovx_patterns_base<OpcPrefix, ExtOp> {
+  let Predicates = [HasVLX, HasBWI] in {
+  def : Pat<(v16i16 (ExtOp (v16i8 VR128X:$src))),
+            (!cast<Instruction>(OpcPrefix#BWZ256rr) VR128X:$src)>;
+  }

-defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s", WriteShuffle256>;
-defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s", WriteShuffle256>;
-defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s", WriteShuffle256>;
-defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s", WriteShuffle256>;
-defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s", WriteShuffle256>;
-defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s", WriteShuffle256>;
+  let Predicates = [HasVLX] in {
+  def : Pat<(v8i32 (ExtOp (v8i16 VR128X:$src))),
+            (!cast<Instruction>(OpcPrefix#WDZ256rr) VR128X:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (v4i32 VR128X:$src))),
+            (!cast<Instruction>(OpcPrefix#DQZ256rr) VR128X:$src)>;
+  }
+
+  // 512-bit patterns
+  let Predicates = [HasBWI] in {
+  def : Pat<(v32i16 (ExtOp (v32i8 VR256X:$src))),
+            (!cast<Instruction>(OpcPrefix#BWZrr) VR256X:$src)>;
+  }
+  let Predicates = [HasAVX512] in {
+  def : Pat<(v16i32 (ExtOp (v16i8 VR128X:$src))),
+            (!cast<Instruction>(OpcPrefix#BDZrr) VR128X:$src)>;
+  def : Pat<(v16i32 (ExtOp (v16i16 VR256X:$src))),
+            (!cast<Instruction>(OpcPrefix#WDZrr) VR256X:$src)>;
+
+  def : Pat<(v8i64 (ExtOp (v8i16 VR128X:$src))),
+            (!cast<Instruction>(OpcPrefix#WQZrr) VR128X:$src)>;
+
+  def : Pat<(v8i64 (ExtOp (v8i32 VR256X:$src))),
+            (!cast<Instruction>(OpcPrefix#DQZrr) VR256X:$src)>;
+  }
+}

 multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
-                                 SDNode InVecOp> {
+                                 SDNode InVecOp> :
+  AVX512_pmovx_patterns_base<OpcPrefix, ExtOp> {
   // 128-bit patterns
   let Predicates = [HasVLX, HasBWI] in {
   def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<Instruction>(OpcPrefix#BWZ128rm) addr:$src)>;
@@ -9695,84 +9779,58 @@
   def : Pat<(v2i64 (InVecOp (loadv4i32 addr:$src))),
             (!cast<Instruction>(OpcPrefix#DQZ128rm) addr:$src)>;
   }
-  // 256-bit patterns
-  let Predicates = [HasVLX, HasBWI] in {
-  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
-            (!cast<Instruction>(OpcPrefix#BWZ256rm) addr:$src)>;
-  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
-            (!cast<Instruction>(OpcPrefix#BWZ256rm) addr:$src)>;
-  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
-            (!cast<Instruction>(OpcPrefix#BWZ256rm) addr:$src)>;
-  }
   let Predicates = [HasVLX] in {
-  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<Instruction>(OpcPrefix#BDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+  def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<Instruction>(OpcPrefix#BDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<Instruction>(OpcPrefix#BDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))),
            (!cast<Instruction>(OpcPrefix#BDZ256rm) addr:$src)>;

-  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<Instruction>(OpcPrefix#BQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<Instruction>(OpcPrefix#BQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<Instruction>(OpcPrefix#BQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))),
            (!cast<Instruction>(OpcPrefix#BQZ256rm) addr:$src)>;

-  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
-           (!cast<Instruction>(OpcPrefix#WDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
-           (!cast<Instruction>(OpcPrefix#WDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
-           (!cast<Instruction>(OpcPrefix#WDZ256rm) addr:$src)>;
-
-  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<Instruction>(OpcPrefix#WQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<Instruction>(OpcPrefix#WQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<Instruction>(OpcPrefix#WQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))),
+  def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))),
            (!cast<Instruction>(OpcPrefix#WQZ256rm) addr:$src)>;
-
-  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
-           (!cast<Instruction>(OpcPrefix#DQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
-           (!cast<Instruction>(OpcPrefix#DQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
-           (!cast<Instruction>(OpcPrefix#DQZ256rm) addr:$src)>;
   }

   // 512-bit patterns
-  let Predicates = [HasBWI] in {
-  def : Pat<(v32i16 (ExtOp (loadv32i8 addr:$src))),
-           (!cast<Instruction>(OpcPrefix#BWZrm) addr:$src)>;
-  }
   let Predicates = [HasAVX512] in {
-  def : Pat<(v16i32 (ExtOp (loadv16i8 addr:$src))),
-           (!cast<Instruction>(OpcPrefix#BDZrm) addr:$src)>;
-
-  def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+  def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<Instruction>(OpcPrefix#BQZrm) addr:$src)>;
-  def : Pat<(v8i64 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v8i64 (InVecOp (loadv16i8 addr:$src))),
            (!cast<Instruction>(OpcPrefix#BQZrm) addr:$src)>;
-
-  def : Pat<(v16i32 (ExtOp (loadv16i16 addr:$src))),
-           (!cast<Instruction>(OpcPrefix#WDZrm) addr:$src)>;
-
-  def : Pat<(v8i64 (ExtOp (loadv8i16 addr:$src))),
-           (!cast<Instruction>(OpcPrefix#WQZrm) addr:$src)>;
-
-  def : Pat<(v8i64 (ExtOp (loadv8i32 addr:$src))),
-           (!cast<Instruction>(OpcPrefix#DQZrm) addr:$src)>;
   }
 }

-defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec>;
-defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec>;
+defm : AVX512_pmovx_patterns<"VPMOVSX", sext, sext_invec>;
+defm : AVX512_pmovx_patterns<"VPMOVZX", zext, zext_invec>;
+defm : AVX512_pmovx_patterns_aext<"VPMOVZX", anyext>;
+
+// Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge
+// ext+trunc aggressively making it impossible to legalize the DAG to this
+// pattern directly.
+let Predicates = [HasAVX512, NoBWI] in {
+def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
+         (VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
+def: Pat<(v16i8 (trunc (bc_v16i16 (loadv4i64 addr:$src)))),
+         (VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>;
+def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst),
+         (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>;
+}

 // Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge
 // ext+trunc aggressively making it impossible to legalize the DAG to this
Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td
+++ lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -103,16 +103,6 @@
 def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;

-def X86vzext   : SDNode<"X86ISD::VZEXT",
-                        SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
-                                             SDTCisInt<0>, SDTCisInt<1>,
-                                             SDTCisOpSmallerThanOp<1, 0>]>>;
-
-def X86vsext   : SDNode<"X86ISD::VSEXT",
-                        SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
-                                             SDTCisInt<0>, SDTCisInt<1>,
-                                             SDTCisOpSmallerThanOp<1, 0>]>>;
-
 def SDTVtrunc    : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
                                         SDTCisInt<0>, SDTCisInt<1>,
                                         SDTCisOpSmallerThanOp<0, 1>]>;
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -5198,26 +5198,64 @@
 defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;

-// AVX2 Patterns
-multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
+// Patterns that we also need for any_extend.
+// Any_extend_vector_inreg is currently legalized to zero_extend_vector_inreg.
+multiclass SS41I_pmovx_avx2_patterns_base<string OpcPrefix, SDNode ExtOp> {
   // Register-Register patterns
   let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
-  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
-            (!cast<Instruction>(OpcPrefix#BWYrr) VR128:$src)>;
+  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
+            (!cast<Instruction>(OpcPrefix#BWYrr) VR128:$src)>;
+  }
+
+  let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
+            (!cast<Instruction>(OpcPrefix#WDYrr) VR128:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
+            (!cast<Instruction>(OpcPrefix#DQYrr) VR128:$src)>;
+  }
+
+  // AVX2 Register-Memory patterns
+  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
+  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
+            (!cast<Instruction>(OpcPrefix#BWYrm) addr:$src)>;
+  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+            (!cast<Instruction>(OpcPrefix#BWYrm) addr:$src)>;
+  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+            (!cast<Instruction>(OpcPrefix#BWYrm) addr:$src)>;
   }
+
   let Predicates = [HasAVX, NoVLX] in {
-  def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
+  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
+            (!cast<Instruction>(OpcPrefix#WDYrm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+            (!cast<Instruction>(OpcPrefix#WDYrm) addr:$src)>;
+  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+            (!cast<Instruction>(OpcPrefix#WDYrm) addr:$src)>;
+
+  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
+            (!cast<Instruction>(OpcPrefix#DQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+            (!cast<Instruction>(OpcPrefix#DQYrm) addr:$src)>;
+  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+            (!cast<Instruction>(OpcPrefix#DQYrm) addr:$src)>;
+  }
+}
+
+// AVX2 Patterns
+multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp, SDNode InVecOp> :
+  SS41I_pmovx_avx2_patterns_base<OpcPrefix, ExtOp> {
+
+  // Register-Register patterns
+  let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
            (!cast<Instruction>(OpcPrefix#BDYrr) VR128:$src)>;
-  def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
+  def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
            (!cast<Instruction>(OpcPrefix#BQYrr) VR128:$src)>;
-  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
-           (!cast<Instruction>(OpcPrefix#WDYrr) VR128:$src)>;
-  def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))),
+  def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
            (!cast<Instruction>(OpcPrefix#WQYrr) VR128:$src)>;
-
-  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
-           (!cast<Instruction>(OpcPrefix#DQYrr) VR128:$src)>;
   }

   // Simple Register-Memory patterns
@@ -5241,60 +5279,39 @@
   }

   // AVX2 Register-Memory patterns
-  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
-  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
-           (!cast<Instruction>(OpcPrefix#BWYrm) addr:$src)>;
-  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
-           (!cast<Instruction>(OpcPrefix#BWYrm) addr:$src)>;
-  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
-           (!cast<Instruction>(OpcPrefix#BWYrm) addr:$src)>;
-  }
   let Predicates = [HasAVX, NoVLX] in {
-  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<Instruction>(OpcPrefix#BDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+  def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
            (!cast<Instruction>(OpcPrefix#BDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<Instruction>(OpcPrefix#BDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))),
            (!cast<Instruction>(OpcPrefix#BDYrm) addr:$src)>;

-  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<Instruction>(OpcPrefix#BQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
            (!cast<Instruction>(OpcPrefix#BQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
            (!cast<Instruction>(OpcPrefix#BQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))),
+  def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))),
            (!cast<Instruction>(OpcPrefix#BQYrm) addr:$src)>;

-  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
-           (!cast<Instruction>(OpcPrefix#WDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
-           (!cast<Instruction>(OpcPrefix#WDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
-           (!cast<Instruction>(OpcPrefix#WDYrm) addr:$src)>;
-
-  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<Instruction>(OpcPrefix#WQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
            (!cast<Instruction>(OpcPrefix#WQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
            (!cast<Instruction>(OpcPrefix#WQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))),
+  def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))),
            (!cast<Instruction>(OpcPrefix#WQYrm) addr:$src)>;
-
-  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
-           (!cast<Instruction>(OpcPrefix#DQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
-           (!cast<Instruction>(OpcPrefix#DQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
-           (!cast<Instruction>(OpcPrefix#DQYrm) addr:$src)>;
   }
 }

-defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
-defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;
+defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
+defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
+defm : SS41I_pmovx_avx2_patterns_base<"VPMOVZX", anyext>;

 // SSE4.1/AVX patterns.
 multiclass SS41I_pmovx_patterns