Index: llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h
+++ llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h
@@ -478,31 +478,33 @@
   /// in-register any-extension of the low lanes of an integer vector. The
   /// result type must have fewer elements than the operand type, and those
   /// elements must be larger integer types such that the total size of the
-  /// operand type and the result type match. Each of the low operand
-  /// elements is any-extended into the corresponding, wider result
-  /// elements with the high bits becoming undef.
+  /// operand type is less than or equal to the size of the result type. Each
+  /// of the low operand elements is any-extended into the corresponding,
+  /// wider result elements with the high bits becoming undef.
+  /// NOTE: The type legalizer prefers to make the operand and result size
+  /// the same to allow expansion to shuffle vector during op legalization.
   ANY_EXTEND_VECTOR_INREG,

   /// SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an
   /// in-register sign-extension of the low lanes of an integer vector. The
   /// result type must have fewer elements than the operand type, and those
   /// elements must be larger integer types such that the total size of the
-  /// operand type and the result type match. Each of the low operand
-  /// elements is sign-extended into the corresponding, wider result
-  /// elements.
-  // FIXME: The SIGN_EXTEND_INREG node isn't specifically limited to
-  // scalars, but it also doesn't handle vectors well. Either it should be
-  // restricted to scalars or this node (and its handling) should be merged
-  // into it.
+  /// operand type is less than or equal to the size of the result type. Each
+  /// of the low operand elements is sign-extended into the corresponding,
+  /// wider result elements.
+  /// NOTE: The type legalizer prefers to make the operand and result size
+  /// the same to allow expansion to shuffle vector during op legalization.
   SIGN_EXTEND_VECTOR_INREG,

   /// ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an
   /// in-register zero-extension of the low lanes of an integer vector. The
   /// result type must have fewer elements than the operand type, and those
   /// elements must be larger integer types such that the total size of the
-  /// operand type and the result type match. Each of the low operand
-  /// elements is zero-extended into the corresponding, wider result
-  /// elements.
+  /// operand type is less than or equal to the size of the result type. Each
+  /// of the low operand elements is zero-extended into the corresponding,
+  /// wider result elements.
+  /// NOTE: The type legalizer prefers to make the operand and result size
+  /// the same to allow expansion to shuffle vector during op legalization.
ZERO_EXTEND_VECTOR_INREG, /// FP_TO_[US]INT - Convert a floating point value to a signed or unsigned Index: llvm/trunk/include/llvm/Target/TargetSelectionDAG.td =================================================================== --- llvm/trunk/include/llvm/Target/TargetSelectionDAG.td +++ llvm/trunk/include/llvm/Target/TargetSelectionDAG.td @@ -162,7 +162,7 @@ ]>; def SDTExtInvec : SDTypeProfile<1, 1, [ // sext_invec SDTCisInt<0>, SDTCisVec<0>, SDTCisInt<1>, SDTCisVec<1>, - SDTCisOpSmallerThanOp<1, 0>, SDTCisSameSizeAs<0,1> + SDTCisOpSmallerThanOp<1, 0> ]>; def SDTSetCC : SDTypeProfile<1, 3, [ // setcc Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4168,9 +4168,8 @@ case ISD::ZERO_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: assert(VT.isVector() && "This DAG node is restricted to vector types."); - assert(VT.getSizeInBits() == Operand.getValueSizeInBits() && - "The sizes of the input and result must match in order to perform the " - "extend in-register."); + assert(Operand.getValueType().bitsLE(VT) && + "The input must be the same size or smaller than the result."); assert(VT.getVectorNumElements() < Operand.getValueType().getVectorNumElements() && "The destination vector type must have fewer lanes than the input."); Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.h +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h @@ -295,11 +295,6 @@ // Vector move to low scalar and zero higher vector elements. VZEXT_MOVL, - // Vector integer zero-extend. - VZEXT, - // Vector integer signed-extend. - VSEXT, - // Vector integer truncate. VTRUNC, // Vector integer truncate with unsigned/signed saturation. Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -1332,12 +1332,11 @@ setOperationAction(ISD::FNEARBYINT, VT, Legal); } - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom); - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom); - // Without BWI we need to use custom lowering to handle MVT::v64i8 input. 
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom); - setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom); + for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) { + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); + } setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); @@ -1555,6 +1554,7 @@ setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); @@ -5456,15 +5456,9 @@ return DAG.getBitcast(VT, Vec); } -static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In, +static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG) { EVT InVT = In.getValueType(); - assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode"); - - if (VT.is128BitVector() && InVT.is128BitVector()) - return DAG.getNode(X86ISD::VSEXT == Opc ? ISD::SIGN_EXTEND_VECTOR_INREG - : ISD::ZERO_EXTEND_VECTOR_INREG, - DL, VT, In); // For 256-bit vectors, we only need the lower (128-bit) input half. // For 512-bit vectors, we only need the lower input half or quarter. @@ -5472,9 +5466,16 @@ int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); In = extractSubVector(In, 0, DAG, DL, std::max(128, (int)VT.getSizeInBits() / Scale)); + InVT = In.getValueType(); } - return DAG.getNode(Opc, DL, VT, In); + if (VT.getVectorNumElements() == In.getValueType().getVectorNumElements()) + return DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, + DL, VT, In); + + return DAG.getNode(Signed ? ISD::SIGN_EXTEND_VECTOR_INREG + : ISD::ZERO_EXTEND_VECTOR_INREG, + DL, VT, In); } /// Returns a vector_shuffle node for an unpackl operation. @@ -6529,7 +6530,7 @@ return true; } case ISD::ZERO_EXTEND_VECTOR_INREG: - case X86ISD::VZEXT: { + case ISD::ZERO_EXTEND: { // TODO - add support for VPMOVZX with smaller input vector types. SDValue Src = N.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); @@ -10880,7 +10881,7 @@ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); - InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG); + InputV = getExtendInVec(/*Signed*/false, DL, ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } @@ -17584,7 +17585,7 @@ "Unexpected element type"); if (Subtarget.hasInt256()) - return DAG.getNode(X86ISD::VZEXT, dl, VT, In); + return Op; // Optimize vectors in AVX mode: // @@ -19895,7 +19896,6 @@ SDValue In = Op->getOperand(0); MVT VT = Op->getSimpleValueType(0); MVT InVT = In.getSimpleValueType(); - assert(VT.getSizeInBits() == InVT.getSizeInBits()); MVT SVT = VT.getVectorElementType(); MVT InSVT = InVT.getVectorElementType(); @@ -19916,11 +19916,12 @@ // For 256-bit vectors, we only need the lower (128-bit) half of the input. // For 512-bit vectors, we need 128-bits or 256-bits. - if (VT.getSizeInBits() > 128) { + if (InVT.getSizeInBits() > 128) { // Input needs to be at least the same number of elements as output, and // at least 128-bits. 
int InSize = InSVT.getSizeInBits() * NumElts; In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128)); + InVT = In.getSimpleValueType(); } // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results, @@ -19928,8 +19929,15 @@ // need to be handled here for 256/512-bit results. if (Subtarget.hasInt256()) { assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"); + + if (InVT.getVectorNumElements() != NumElts) + return DAG.getNode(Op.getOpcode(), dl, VT, In); + + // FIXME: Apparently we create inreg operations that could be regular + // extends. unsigned ExtOpc = - Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? X86ISD::VSEXT : X86ISD::VZEXT; + Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; return DAG.getNode(ExtOpc, dl, VT, In); } @@ -19939,7 +19947,6 @@ int HalfNumElts = NumElts / 2; MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts); - InVT = In.getSimpleValueType(); unsigned NumSrcElts = InVT.getVectorNumElements(); SmallVector HiMask(NumSrcElts, SM_SentinelUndef); for (int i = 0; i != HalfNumElts; ++i) @@ -20011,7 +20018,7 @@ "Unexpected element type"); if (Subtarget.hasInt256()) - return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + return Op; // Optimize vectors in AVX mode // Sign extend v8i16 to v8i32 and @@ -20083,7 +20090,7 @@ // Lower vector extended loads using a shuffle. If SSSE3 is not available we // may emit an illegal shuffle but the expansion is still better than scalar -// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise +// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise // we'll emit a shuffle and a arithmetic shift. // FIXME: Is the expansion actually better than scalar code? It doesn't seem so. // TODO: It is possible to support ZExt by zeroing the undef values during @@ -20268,9 +20275,9 @@ unsigned SizeRatio = RegSz / MemSz; if (Ext == ISD::SEXTLOAD) { - // If we have SSE4.1, we can directly emit a VSEXT node. + // If we have SSE4.1, we can directly emit a sext/sext_invec node. if (Subtarget.hasSSE41()) { - SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG); + SDValue Sext = getExtendInVec(/*Signed*/true, dl, RegVT, SlicedVec, DAG); return DAG.getMergeValues({Sext, TF}, dl); } @@ -20286,7 +20293,7 @@ if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && MemVT == MVT::v8i8) { - SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG); + SDValue Sext = getExtendInVec(/*Signed*/false, dl, RegVT, SlicedVec, DAG); return DAG.getMergeValues({Sext, TF}, dl); } @@ -26683,8 +26690,6 @@ case X86ISD::LDEC: return "X86ISD::LDEC"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; - case X86ISD::VZEXT: return "X86ISD::VZEXT"; - case X86ISD::VSEXT: return "X86ISD::VSEXT"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; @@ -29640,23 +29645,6 @@ Known = Known.trunc(BitWidth); break; } - case X86ISD::VZEXT: { - // TODO: Add DemandedElts support. 
- SDValue N0 = Op.getOperand(0); - unsigned NumElts = VT.getVectorNumElements(); - - EVT SrcVT = N0.getValueType(); - unsigned InNumElts = SrcVT.getVectorNumElements(); - unsigned InBitWidth = SrcVT.getScalarSizeInBits(); - assert(InNumElts >= NumElts && "Illegal VZEXT input"); - - Known = KnownBits(InBitWidth); - APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts); - DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1); - Known = Known.zext(BitWidth); - Known.Zero.setBitsFrom(InBitWidth); - break; - } case X86ISD::CMOV: { DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1); // If we don't know any bits, early out. @@ -29734,14 +29722,6 @@ // SETCC_CARRY sets the dest to ~0 for true or 0 for false. return VTBits; - case X86ISD::VSEXT: { - // TODO: Add DemandedElts support. - SDValue Src = Op.getOperand(0); - unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); - Tmp += VTBits - Src.getScalarValueSizeInBits(); - return Tmp; - } - case X86ISD::VTRUNC: { // TODO: Add DemandedElts support. SDValue Src = Op.getOperand(0); @@ -29865,10 +29845,12 @@ MVT::getIntegerVT(MaskEltSize); SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize); - if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) { + if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); - Shuffle = unsigned(X86ISD::VZEXT); - } else + + if (SrcVT.getVectorNumElements() == NumDstElts) + Shuffle = unsigned(ISD::ZERO_EXTEND); + else Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); DstVT = MVT::getIntegerVT(Scale * MaskEltSize); @@ -32051,18 +32033,6 @@ } break; } - case X86ISD::VSEXT: { - SDValue Src = Op.getOperand(0); - unsigned InBits = Src.getScalarValueSizeInBits(); - - // If none of the top bits are demanded, convert this into an any_extend. - if (OriginalDemandedBits.getActiveBits() <= InBits) - return TLO.CombineTo(Op, - TLO.DAG.getNode(X86ISD::VZEXT, SDLoc(Op), - Op.getValueType(), Src)); - - break; - } } return TargetLowering::SimplifyDemandedBitsForTargetNode( @@ -36740,7 +36710,7 @@ Mld->getBasePtr(), NewMask, WidePassThru, Mld->getMemoryVT(), Mld->getMemOperand(), ISD::NON_EXTLOAD); - SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG); + SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG); return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); } @@ -40195,8 +40165,7 @@ if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) { APInt Undefs(NumElts, 0); SmallVector Vals(NumElts, APInt(EltSizeInBits, 0)); - bool IsZEXT = - (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG); + bool IsZEXT = (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG); for (unsigned i = 0; i != NumElts; ++i) { if (UndefElts[i]) { Undefs.setBit(i); @@ -40461,13 +40430,20 @@ return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0)); } } - if ((InOpcode == X86ISD::VZEXT || InOpcode == X86ISD::VSEXT) && + if ((InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::SIGN_EXTEND) && OpVT.is128BitVector() && InVec.getOperand(0).getSimpleValueType().is128BitVector()) { - unsigned ExtOp = InOpcode == X86ISD::VZEXT ? ISD::ZERO_EXTEND_VECTOR_INREG - : ISD::SIGN_EXTEND_VECTOR_INREG; + unsigned ExtOp = + InOpcode == ISD::ZERO_EXTEND ? 
ISD::ZERO_EXTEND_VECTOR_INREG + : ISD::SIGN_EXTEND_VECTOR_INREG; return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0)); } + if ((InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || + InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) && + OpVT.is128BitVector() && + InVec.getOperand(0).getSimpleValueType().is128BitVector()) { + return DAG.getNode(InOpcode, SDLoc(N), OpVT, InVec.getOperand(0)); + } if (InOpcode == ISD::BITCAST) { // TODO - do this for target shuffles in general. SDValue InVecBC = peekThroughOneUseBitcasts(InVec); @@ -40592,8 +40568,7 @@ return combineVectorShiftImm(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: - case X86ISD::VSEXT: - case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget); + return combineVSZext(N, DAG, DCI, Subtarget); case X86ISD::PINSRB: case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrAVX512.td +++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td @@ -9528,7 +9528,7 @@ EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, VEX_WIG; defm Z256: WriteShuffle256_common, + v16i8x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { @@ -9547,12 +9547,12 @@ EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, VEX_WIG; defm Z256: WriteShuffle256_common, + v16i8x_info, i32mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { defm Z : WriteShuffle256_common, + v16i8x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, VEX_WIG; } } @@ -9585,7 +9585,7 @@ EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, VEX_WIG; defm Z256: WriteShuffle256_common, + v8i16x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { @@ -9615,23 +9615,107 @@ } } -defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXWD : WriteShuffle256_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z", WriteShuffle256>; - -defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s", WriteShuffle256>; +defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", zext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", zext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", zext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXWD : 
WriteShuffle256_WD<0x33, "vpmovzxwd", zext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", zext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", zext, zext_invec, "z", WriteShuffle256>; + +defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", sext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", sext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", sext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", sext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", sext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", sext, sext_invec, "s", WriteShuffle256>; + + +// Patterns that we also need any extend versions of. aext_vector_inreg +// is currently legalized to zext_vector_inreg. +multiclass AVX512_pmovx_patterns_base { + // 256-bit patterns + let Predicates = [HasVLX, HasBWI] in { + def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), + (!cast(OpcPrefix#BWZ256rm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#BWZ256rm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#BWZ256rm) addr:$src)>; + } + + let Predicates = [HasVLX] in { + def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), + (!cast(OpcPrefix#WDZ256rm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#WDZ256rm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#WDZ256rm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), + (!cast(OpcPrefix#DQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#DQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#DQZ256rm) addr:$src)>; + } + + // 512-bit patterns + let Predicates = [HasBWI] in { + def : Pat<(v32i16 (ExtOp (loadv32i8 addr:$src))), + (!cast(OpcPrefix#BWZrm) addr:$src)>; + } + let Predicates = [HasAVX512] in { + def : Pat<(v16i32 (ExtOp (loadv16i8 addr:$src))), + (!cast(OpcPrefix#BDZrm) addr:$src)>; + def : Pat<(v16i32 (ExtOp (loadv16i16 addr:$src))), + (!cast(OpcPrefix#WDZrm) addr:$src)>; + + def : Pat<(v8i64 (ExtOp (loadv8i16 addr:$src))), + (!cast(OpcPrefix#WQZrm) addr:$src)>; + + def : Pat<(v8i64 (ExtOp (loadv8i32 addr:$src))), + (!cast(OpcPrefix#DQZrm) addr:$src)>; + } +} + +multiclass AVX512_pmovx_patterns_aext : + AVX512_pmovx_patterns_base { + let Predicates = [HasVLX, HasBWI] in { + def : Pat<(v16i16 (ExtOp (v16i8 VR128X:$src))), + (!cast(OpcPrefix#BWZ256rr) VR128X:$src)>; + } + + let Predicates = [HasVLX] in { + def : Pat<(v8i32 (ExtOp (v8i16 VR128X:$src))), + (!cast(OpcPrefix#WDZ256rr) VR128X:$src)>; + + def : Pat<(v4i64 (ExtOp (v4i32 VR128X:$src))), + (!cast(OpcPrefix#DQZ256rr) VR128X:$src)>; + } + + // 512-bit patterns + let Predicates = [HasBWI] in { + def : Pat<(v32i16 (ExtOp (v32i8 VR256X:$src))), + (!cast(OpcPrefix#BWZrr) VR256X:$src)>; + } + let Predicates = [HasAVX512] in { + def : Pat<(v16i32 (ExtOp (v16i8 VR128X:$src))), + (!cast(OpcPrefix#BDZrr) VR128X:$src)>; + def : Pat<(v16i32 (ExtOp (v16i16 VR256X:$src))), + (!cast(OpcPrefix#WDZrr) VR256X:$src)>; + + def : Pat<(v8i64 (ExtOp (v8i16 VR128X:$src))), + (!cast(OpcPrefix#WQZrr) VR128X:$src)>; + + def : Pat<(v8i64 (ExtOp (v8i32 VR256X:$src))), + 
(!cast(OpcPrefix#DQZrr) VR256X:$src)>; + } +} multiclass AVX512_pmovx_patterns { + SDNode InVecOp> : + AVX512_pmovx_patterns_base { // 128-bit patterns let Predicates = [HasVLX, HasBWI] in { def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), @@ -9695,84 +9779,58 @@ def : Pat<(v2i64 (InVecOp (loadv4i32 addr:$src))), (!cast(OpcPrefix#DQZ128rm) addr:$src)>; } - // 256-bit patterns - let Predicates = [HasVLX, HasBWI] in { - def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), - (!cast(OpcPrefix#BWZ256rm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWZ256rm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWZ256rm) addr:$src)>; - } let Predicates = [HasVLX] in { - def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))), + def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BDZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), (!cast(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))), + def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BQZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), - (!cast(OpcPrefix#WDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDZ256rm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDZ256rm) addr:$src)>; - - def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#WQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#WQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#WQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))), + def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WQZ256rm) addr:$src)>; - - def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), - (!cast(OpcPrefix#DQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQZ256rm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQZ256rm) addr:$src)>; } // 512-bit patterns - let Predicates = [HasBWI] in { - def : Pat<(v32i16 
(ExtOp (loadv32i8 addr:$src))), - (!cast(OpcPrefix#BWZrm) addr:$src)>; - } let Predicates = [HasAVX512] in { - def : Pat<(v16i32 (ExtOp (loadv16i8 addr:$src))), - (!cast(OpcPrefix#BDZrm) addr:$src)>; - - def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#BQZrm) addr:$src)>; - def : Pat<(v8i64 (ExtOp (loadv16i8 addr:$src))), + def : Pat<(v8i64 (InVecOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BQZrm) addr:$src)>; - - def : Pat<(v16i32 (ExtOp (loadv16i16 addr:$src))), - (!cast(OpcPrefix#WDZrm) addr:$src)>; - - def : Pat<(v8i64 (ExtOp (loadv8i16 addr:$src))), - (!cast(OpcPrefix#WQZrm) addr:$src)>; - - def : Pat<(v8i64 (ExtOp (loadv8i32 addr:$src))), - (!cast(OpcPrefix#DQZrm) addr:$src)>; } } -defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec>; -defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec>; +defm : AVX512_pmovx_patterns<"VPMOVSX", sext, sext_invec>; +defm : AVX512_pmovx_patterns<"VPMOVZX", zext, zext_invec>; +defm : AVX512_pmovx_patterns_aext<"VPMOVZX", anyext>; + +// Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge +// ext+trunc aggresively making it impossible to legalize the DAG to this +// pattern directly. +let Predicates = [HasAVX512, NoBWI] in { +def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))), + (VPMOVDBZrr (v16i32 (VPMOVZXWDZrr VR256X:$src)))>; +def: Pat<(v16i8 (trunc (bc_v16i16 (loadv4i64 addr:$src)))), + (VPMOVDBZrr (v16i32 (VPMOVZXWDZrm addr:$src)))>; +def: Pat<(store (v16i8 (trunc (v16i16 VR256X:$src))), addr:$dst), + (VPMOVDBZmr addr:$dst, (v16i32 (VPMOVZXWDZrr VR256X:$src)))>; +} // Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge // ext+trunc aggresively making it impossible to legalize the DAG to this Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td +++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -103,16 +103,6 @@ def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; -def X86vzext : SDNode<"X86ISD::VZEXT", - SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisOpSmallerThanOp<1, 0>]>>; - -def X86vsext : SDNode<"X86ISD::VSEXT", - SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisOpSmallerThanOp<1, 0>]>>; - def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<0, 1>]>; Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td @@ -5198,26 +5198,64 @@ defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>; -// AVX2 Patterns -multiclass SS41I_pmovx_avx2_patterns { +// Patterns that we also need for any_extend. +// Any_extend_vector_inreg is currently legalized to zero_extend_vector_inreg. 
+multiclass SS41I_pmovx_avx2_patterns_base { // Register-Register patterns let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { - def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), - (!cast(OpcPrefix#BWYrr) VR128:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), + (!cast(OpcPrefix#BWYrr) VR128:$src)>; + } + + let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), + (!cast(OpcPrefix#WDYrr) VR128:$src)>; + + def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), + (!cast(OpcPrefix#DQYrr) VR128:$src)>; + } + + // AVX2 Register-Memory patterns + let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), + (!cast(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#BWYrm) addr:$src)>; } + let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))), + def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), + (!cast(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#WDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), + (!cast(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast(OpcPrefix#DQYrm) addr:$src)>; + } +} + +// AVX2 Patterns +multiclass SS41I_pmovx_avx2_patterns : + SS41I_pmovx_avx2_patterns_base { + + // Register-Register patterns + let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))), (!cast(OpcPrefix#BDYrr) VR128:$src)>; - def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))), + def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))), (!cast(OpcPrefix#BQYrr) VR128:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), - (!cast(OpcPrefix#WDYrr) VR128:$src)>; - def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))), + def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))), (!cast(OpcPrefix#WQYrr) VR128:$src)>; - - def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), - (!cast(OpcPrefix#DQYrr) VR128:$src)>; } // Simple Register-Memory patterns @@ -5241,60 +5279,39 @@ } // AVX2 Register-Memory patterns - let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { - def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), - (!cast(OpcPrefix#BWYrm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWYrm) addr:$src)>; - def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#BWYrm) addr:$src)>; - } let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v8i32 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))), + def : Pat<(v8i32 (InVecOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + def 
: Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), (!cast(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))), + def : Pat<(v4i64 (InVecOp (loadv16i8 addr:$src))), (!cast(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), - (!cast(OpcPrefix#WDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDYrm) addr:$src)>; - def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#WDYrm) addr:$src)>; - - def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))), (!cast(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), (!cast(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))), + def : Pat<(v4i64 (InVecOp (loadv8i16 addr:$src))), (!cast(OpcPrefix#WQYrm) addr:$src)>; - - def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), - (!cast(OpcPrefix#DQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQYrm) addr:$src)>; - def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), - (!cast(OpcPrefix#DQYrm) addr:$src)>; } } -defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>; -defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>; +defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>; +defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>; +defm : SS41I_pmovx_avx2_patterns_base<"VPMOVZX", anyext>; // SSE4.1/AVX patterns. multiclass SS41I_pmovx_patterns
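The behavioral core of this patch is the relaxed contract on the *_EXTEND_VECTOR_INREG opcodes and the rewritten getExtendInVec helper: the operand may now be narrower than the result (not only exactly the same total width), and when operand and result already have the same number of lanes the X86 lowering emits a plain ISD::SIGN_EXTEND/ZERO_EXTEND instead of the in-register form. The standalone sketch below models that selection rule with made-up VecType/pickExtendOpcode names; it is an illustration of the rule described in the hunks above, not code from the LLVM tree.

#include <cassert>
#include <cstdio>
#include <string>

// Hypothetical stand-in for a SelectionDAG vector type: NumElts lanes of
// EltBits bits each (e.g. {16, 8} models v16i8).
struct VecType {
  unsigned NumElts;
  unsigned EltBits;
  unsigned sizeInBits() const { return NumElts * EltBits; }
};

// Models the relaxed assertions this patch adds to SelectionDAG::getNode for
// *_EXTEND_VECTOR_INREG: the operand may be no wider than the result, and the
// result must have fewer (wider) lanes than the operand.
bool isValidExtendInReg(VecType In, VecType Out) {
  return In.sizeInBits() <= Out.sizeInBits() && Out.NumElts < In.NumElts;
}

// Models the opcode choice in the rewritten getExtendInVec(): if the lane
// counts already match, a regular sign/zero extend suffices; otherwise the
// in-register form is used.
std::string pickExtendOpcode(bool Signed, VecType In, VecType Out) {
  if (In.NumElts == Out.NumElts)
    return Signed ? "ISD::SIGN_EXTEND" : "ISD::ZERO_EXTEND";
  assert(isValidExtendInReg(In, Out) && "invalid in-register extend");
  return Signed ? "ISD::SIGN_EXTEND_VECTOR_INREG"
                : "ISD::ZERO_EXTEND_VECTOR_INREG";
}

int main() {
  VecType V16i8{16, 8}, V8i16{8, 16}, V8i32{8, 32}, V4i32{4, 32};
  std::printf("%s\n", pickExtendOpcode(true, V8i16, V8i32).c_str());
  std::printf("%s\n", pickExtendOpcode(false, V16i8, V4i32).c_str());
  return 0;
}

Running this prints ISD::SIGN_EXTEND for the v8i16 -> v8i32 case (same lane count, so a regular extend) and ISD::ZERO_EXTEND_VECTOR_INREG for v16i8 -> v4i32 (fewer, wider result lanes), matching the two branches added to getExtendInVec.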
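The lane semantics themselves are unchanged by the relaxation: only the low lanes of the input participate, and each is widened into the corresponding result lane (zero-filled, sign-filled, or undef in the high bits, depending on the opcode). A minimal scalar model of zero_extend_vector_inreg from 8 x i16 to 4 x i32, again using hypothetical names rather than LLVM APIs:

#include <array>
#include <cstdint>
#include <cstdio>

// Model of ISD::ZERO_EXTEND_VECTOR_INREG on v8i16 -> v4i32: the low four i16
// lanes are widened with zero fill; the upper four input lanes are dropped.
std::array<uint32_t, 4> zextInVec_v8i16_to_v4i32(const std::array<uint16_t, 8> &In) {
  std::array<uint32_t, 4> Out{};
  for (int I = 0; I != 4; ++I)
    Out[I] = static_cast<uint32_t>(In[I]); // high 16 bits become zero
  return Out;
}

int main() {
  std::array<uint16_t, 8> In{0xFFFF, 1, 2, 3, 4, 5, 6, 7};
  auto Out = zextInVec_v8i16_to_v4i32(In);
  for (uint32_t V : Out)
    std::printf("0x%08X\n", V); // 0x0000FFFF 0x00000001 0x00000002 0x00000003
  return 0;
}

The upper four input lanes are discarded, which is why the result type must still have fewer lanes than the operand even under the relaxed total-size rule.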