Index: llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -253,6 +253,11 @@
       return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
     }
 
+    bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
+                          SDValue &Base, SDValue &Scale,
+                          SDValue &Index, SDValue &Disp,
+                          SDValue &Segment);
+
     /// Implement addressing mode selection for inline asm expressions.
     bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                       unsigned ConstraintID,
@@ -2592,6 +2597,20 @@
                        N.getOperand(1), Base, Scale, Index, Disp, Segment);
 }
 
+bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
+                                       SDValue &Base, SDValue &Scale,
+                                       SDValue &Index, SDValue &Disp,
+                                       SDValue &Segment) {
+  assert(Root && P && "Unknown root/parent nodes");
+  if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
+      !IsProfitableToFold(N, P, Root) ||
+      !IsLegalToFold(N, P, Root, OptLevel))
+    return false;
+
+  return selectAddr(N.getNode(),
+                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
 /// Return an SDNode that returns the value of the global base register.
 /// Output instructions required to initialize the global base register,
 /// if necessary.
@@ -4234,13 +4253,14 @@
   auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
     // Look through single use bitcasts.
-    if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse())
-      Src = Src.getOperand(0);
-
-    if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) {
+    if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) {
       Parent = Src.getNode();
       Src = Src.getOperand(0);
-      if (Src.getSimpleValueType() == CmpSVT)
+    }
+
+    if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) {
+      auto *MemIntr = cast<MemIntrinsicSDNode>(Src);
+      if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits())
         return Src;
     }
 
@@ -4252,17 +4272,18 @@
   bool FoldedBCast = false;
   if (!FoldedLoad && CanFoldLoads &&
       (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
-    SDNode *ParentNode = nullptr;
+    SDNode *ParentNode = N0.getNode();
     if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
-      FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
-                                Tmp1, Tmp2, Tmp3, Tmp4);
+      FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
+                                     Tmp1, Tmp2, Tmp3, Tmp4);
     }
 
     // Try the other operand.
     if (!FoldedBCast) {
+      SDNode *ParentNode = N0.getNode();
       if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
-        FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
-                                  Tmp1, Tmp2, Tmp3, Tmp4);
+        FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0,
+                                       Tmp1, Tmp2, Tmp3, Tmp4);
         if (FoldedBCast)
           std::swap(Src0, Src1);
       }
@@ -4332,7 +4353,7 @@
       // Update the chain.
       ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
       // Record the mem-refs
-      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+      CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(Load)->getMemOperand()});
     } else {
       if (IsMasked)
         CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -615,6 +615,9 @@
       // extract_vector_elt, store.
       VEXTRACT_STORE,
 
+      // scalar broadcast from memory
+      VBROADCAST_LOAD,
+
       // Store FP control world into i16 memory.
       FNSTCW16m,
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -6130,6 +6130,37 @@
     }
   }
 
+  if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
+      EltSizeInBits <= VT.getScalarSizeInBits()) {
+    auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+    if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
+      return false;
+
+    SDValue Ptr = MemIntr->getBasePtr();
+    if (Ptr->getOpcode() == X86ISD::Wrapper ||
+        Ptr->getOpcode() == X86ISD::WrapperRIP)
+      Ptr = Ptr->getOperand(0);
+
+    auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
+    if (!CNode || CNode->isMachineConstantPoolEntry() ||
+        CNode->getOffset() != 0)
+      return false;
+
+    if (const Constant *C = CNode->getConstVal()) {
+      unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
+      unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+      APInt UndefSrcElts(NumSrcElts, 0);
+      SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
+      if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
+        if (UndefSrcElts[0])
+          UndefSrcElts.setBits(0, NumSrcElts);
+        SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
+        return CastBitData(UndefSrcElts, SrcEltBits);
+      }
+    }
+  }
+
   // Extract constant bits from a subvector broadcast.
   if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
     SmallVector<APInt, 64> SubEltBits;
@@ -28582,6 +28613,7 @@
   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
+  case X86ISD::VBROADCAST_LOAD:    return "X86ISD::VBROADCAST_LOAD";
   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
   case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
   case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
@@ -33347,6 +33379,19 @@
   if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
     return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
 
+  // vbroadcast(scalarload X) -> vbroadcast_load X
+  if (!SrcVT.isVector() && Src.hasOneUse() &&
+      ISD::isNormalLoad(Src.getNode())) {
+    LoadSDNode *LN = cast<LoadSDNode>(Src);
+    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+    SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+    SDValue BcastLd =
+        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+                                LN->getMemoryVT(), LN->getMemOperand());
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+    return BcastLd;
+  }
+
   // Share broadcast with the longest vector and extract low subvector (free).
   for (SDNode *User : Src->uses())
     if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
@@ -33512,17 +33557,23 @@
                        DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
   }
 
-  // If we're inserting an element from a vbroadcast of a load, fold the
+  // If we're inserting an element from a vbroadcast load, fold the
   // load into the X86insertps instruction. We need to convert the scalar
   // load to a vector and clear the source lane of the INSERTPS control.
-  if (Op1.getOpcode() == X86ISD::VBROADCAST && Op1.hasOneUse() &&
-      Op1.getOperand(0).hasOneUse() &&
-      !Op1.getOperand(0).getValueType().isVector() &&
-      ISD::isNormalLoad(Op1.getOperand(0).getNode()))
-    return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
-                       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
-                                   Op1.getOperand(0)),
-                       DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
+  if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
+    auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
+    if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
+      SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
+                                 MemIntr->getBasePtr(),
+                                 MemIntr->getMemOperand());
+      SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
+                                   DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
+                                               Load),
+                                   DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
+      DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
+      return Insert;
+    }
+  }
 
   return SDValue();
 }
@@ -35851,6 +35902,23 @@
     return DAG.getBitcast(VT, SrcOp);
   }
 
+  // If we're extracting a single element from a broadcast load and there are
+  // no other users, just create a single load.
+  if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
+    auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
+    unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
+    if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
+        VT.getSizeInBits() == SrcBCWidth) {
+      SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
+                                 MemIntr->getBasePtr(),
+                                 MemIntr->getPointerInfo(),
+                                 MemIntr->getAlignment(),
+                                 MemIntr->getMemOperand()->getFlags());
+      DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
+      return Load;
+    }
+  }
+
   // Handle extract(truncate(x)) for 0'th index.
   // TODO: Treat this as a faux shuffle?
   // TODO: When can we use this for general indices?
@@ -43893,6 +43961,21 @@
   if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
     return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
 
+  // If this is a broadcast load inserted into an upper undef, use a larger
+  // broadcast load.
+  if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
+      SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+    auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
+    SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
+    SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
+    SDValue BcastLd =
+        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
+                                MemIntr->getMemoryVT(),
+                                MemIntr->getMemOperand());
+    DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
+    return BcastLd;
+  }
+
   return SDValue();
 }
@@ -44065,6 +44148,20 @@
       InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits())
     return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0));
 
+  if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) {
+    auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
+    if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) {
+      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+      SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
+      SDValue BcastLd =
+          DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
+                                  MemIntr->getMemoryVT(),
+                                  MemIntr->getMemOperand());
+      DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
+      return BcastLd;
+    }
+  }
+
   // If we're extracting the lowest subvector and we're the only user,
   // we may be able to perform this with a smaller vector width.
if (IdxVal == 0 && InVec.hasOneUse()) { Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrAVX512.td +++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td @@ -74,6 +74,7 @@ PatFrag AlignedLdFrag = !cast("alignedload" # VTName); PatFrag ScalarLdFrag = !cast("load" # EltVT); + PatFrag BroadcastLdFrag = !cast("X86VBroadcastld" # EltSizeName); ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"), !cast("sse_load_f32"), @@ -1124,7 +1125,8 @@ X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, bit IsConvertibleToThreeAddress, - SDPatternOperator UnmaskedOp = X86VBroadcast> { + SDPatternOperator UnmaskedOp = X86VBroadcast, + SDPatternOperator UnmaskedBcastOp = SrcInfo.BroadcastLdFrag> { let hasSideEffects = 0 in def r : AVX512PI, T8PD, EVEX, EVEX_CD8, Sched<[SchedRM]>; @@ -1182,7 +1184,7 @@ (MaskInfo.VT (bitconvert (DestInfo.VT - (X86VBroadcast (SrcInfo.ScalarLdFrag addr:$src))))), + (SrcInfo.BroadcastLdFrag addr:$src)))), MaskInfo.ImmAllZerosV))], DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, EVEX_CD8, Sched<[SchedRM]>; @@ -1199,7 +1201,7 @@ (MaskInfo.VT (bitconvert (DestInfo.VT - (X86VBroadcast (SrcInfo.ScalarLdFrag addr:$src))))), + (SrcInfo.BroadcastLdFrag addr:$src)))), MaskInfo.RC:$src0))], DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, EVEX_CD8, Sched<[SchedRM]>; @@ -1394,6 +1396,10 @@ // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZm addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v16i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZm addr:$src)>; } let Predicates = [HasVLX] in { @@ -1402,6 +1408,12 @@ (VPBROADCASTQZ128m addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZ256m addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZ128m addr:$src)>; + def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZ256m addr:$src)>; } let Predicates = [HasVLX, HasBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. @@ -1422,6 +1434,12 @@ def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZ256m addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZ128m addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZ256m addr:$src)>; } let Predicates = [HasBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. @@ -1434,6 +1452,10 @@ def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZm addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. 
+ def : Pat<(v32i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZm addr:$src)>; } //===----------------------------------------------------------------------===// @@ -1669,12 +1691,12 @@ let Predicates = [HasDQI] in defm Z : avx512_broadcast_rm_split, + _Src.info512, _Src.info128, 0, null_frag, null_frag>, EVEX_V512; let Predicates = [HasDQI, HasVLX] in defm Z256 : avx512_broadcast_rm_split, + _Src.info256, _Src.info128, 0, null_frag, null_frag>, EVEX_V256; } @@ -1685,7 +1707,7 @@ let Predicates = [HasDQI, HasVLX] in defm Z128 : avx512_broadcast_rm_split, + _Src.info128, _Src.info128, 0, null_frag, null_frag>, EVEX_V128; } @@ -1753,7 +1775,7 @@ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src2, - IdxVT.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>, + IdxVT.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>, AVX5128IBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1830,7 +1852,7 @@ def : Pat<(_.VT (vselect _.KRCWM:$mask, (X86VPermt2 _.RC:$src2, (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))), - (X86VBroadcast (_.ScalarLdFrag addr:$src3))), + (_.BroadcastLdFrag addr:$src3)), (_.VT (bitconvert (CastVT.VT _.RC:$src1))))), (!cast(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; @@ -1869,7 +1891,7 @@ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src1, - IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>, + IdxVT.RC:$src2,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>, AVX5128IBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -2166,7 +2188,7 @@ !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst", "|$dst, $src1, ${src2}", _.BroadcastStr, "}"), [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), - (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>, + (_.BroadcastLdFrag addr:$src2)))]>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512BI, + (_.BroadcastLdFrag addr:$src2))))]>, EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -2345,8 +2366,7 @@ "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1), - (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + (_.BroadcastLdFrag addr:$src2), cond)))]>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmibk : AVX512AIi8, EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; - def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + def : Pat<(_.KVT (CommFrag:$cc (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), cond)), (!cast(Name#_.ZSuffix#"rmib") _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>; def : Pat<(and _.KRCWM:$mask, - (_.KVT (CommFrag_su:$cc (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + (_.KVT (CommFrag_su:$cc (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), cond))), (!cast(Name#_.ZSuffix#"rmibk") _.KRCWM:$mask, _.RC:$src1, addr:$src2, @@ -2551,10 +2569,10 @@ "$cc, ${src2}"#_.BroadcastStr#", $src1", "$src1, ${src2}"#_.BroadcastStr#", $cc", (X86cmpm (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc), (X86cmpm_su (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (_.VT (_.BroadcastLdFrag addr:$src2)), timm:$cc)>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -2571,13 +2589,12 @@ _.RC:$src1, 
addr:$src2, (X86cmpm_imm_commute timm:$cc))>; - def : Pat<(X86cmpm (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + def : Pat<(X86cmpm (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), timm:$cc), (!cast(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; - def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), timm:$cc)), (!cast(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, @@ -2721,8 +2738,7 @@ _.BroadcastStr##", $dst|$dst, ${src1}" ##_.BroadcastStr##", $src2}", [(set _.KRC:$dst,(X86Vfpclass - (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src1))), + (_.VT (_.BroadcastLdFrag addr:$src1)), (i32 timm:$src2)))]>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512, EVEX_B, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4589,8 +4604,7 @@ "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (_.BroadcastLdFrag addr:$src2)))>, AVX512BIBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4716,8 +4730,7 @@ "${src2}"##_Brdct.BroadcastStr##", $src1", "$src1, ${src2}"##_Brdct.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Brdct.VT (X86VBroadcast - (_Brdct.ScalarLdFrag addr:$src2))))))>, + (_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>, AVX512BIBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4789,8 +4802,7 @@ "${src2}"##_Src.BroadcastStr##", $src1", "$src1, ${src2}"##_Src.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Src.VT (X86VBroadcast - (_Src.ScalarLdFrag addr:$src2))))))>, + (_Src.VT (_Src.BroadcastLdFrag addr:$src2)))))>, EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5149,15 +5161,13 @@ X86VectorVTInfo IntInfo> { // Register-broadcast logical operations. 
def : Pat<(IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))), + (bitconvert (_.VT (_.BroadcastLdFrag addr:$src2))))), (!cast(InstrStr#rmb) _.RC:$src1, addr:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, (bitconvert (_.VT - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))))), + (_.BroadcastLdFrag addr:$src2)))))), _.RC:$src0)), (!cast(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; @@ -5165,8 +5175,7 @@ (bitconvert (IntInfo.VT (OpNode _.RC:$src1, (bitconvert (_.VT - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))))), + (_.BroadcastLdFrag addr:$src2)))))), _.ImmAllZerosV)), (!cast(InstrStr#rmbkz) _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; @@ -5447,8 +5456,7 @@ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5578,8 +5586,7 @@ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -5752,7 +5759,7 @@ defm mbi : AVX512_maskable, + (_.VT (OpNode (_.BroadcastLdFrag addr:$src1), (i8 timm:$src2)))>, EVEX_B, Sched<[sched.Folded]>; } @@ -5936,8 +5943,7 @@ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))>, + (_.VT (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))>, AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6211,8 +6217,7 @@ "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, - (Ctrl.VT (X86VBroadcast - (Ctrl.ScalarLdFrag addr:$src2)))))>, + (Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>, T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6402,7 +6407,7 @@ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src2, - _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>, + _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -6476,7 +6481,7 @@ OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", "$src2, ${src3}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src2, - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6554,7 +6559,7 @@ (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", "$src2, ${src3}"##_.BroadcastStr, - (_.VT (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1, _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6947,7 +6952,7 @@ OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src2, - (_.VT (X86VBroadcast (_.ScalarLdFrag 
addr:$src3))), + (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1)>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -7487,14 +7492,13 @@ OpcodeStr, "${src}"##Broadcast, "${src}"##Broadcast, (_.VT (OpNode (_Src.VT - (X86VBroadcast (_Src.ScalarLdFrag addr:$src))) + (_Src.BroadcastLdFrag addr:$src)) )), (vselect MaskRC:$mask, (_.VT (OpNode (_Src.VT - (X86VBroadcast - (_Src.ScalarLdFrag addr:$src))))), + (_Src.BroadcastLdFrag addr:$src)))), _.RC:$src0), vselect, "$src0 = $dst">, EVEX, EVEX_B, Sched<[sched.Folded]>; @@ -7629,14 +7633,14 @@ v8f32x_info.ImmAllZerosV), (VCVTPD2PSZrmkz VK8WM:$mask, addr:$src)>; - def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2PSZrmb addr:$src)>; def : Pat<(vselect VK8WM:$mask, - (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))), + (fpround (v8f64 (X86VBroadcastld64 addr:$src))), (v8f32 VR256X:$src0)), (VCVTPD2PSZrmbk VR256X:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, - (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))), + (fpround (v8f64 (X86VBroadcastld64 addr:$src))), v8f32x_info.ImmAllZerosV), (VCVTPD2PSZrmbkz VK8WM:$mask, addr:$src)>; } @@ -7660,14 +7664,14 @@ v4f32x_info.ImmAllZerosV), (VCVTPD2PSZ256rmkz VK4WM:$mask, addr:$src)>; - def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2PSZ256rmb addr:$src)>; def : Pat<(vselect VK4WM:$mask, - (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), VR128X:$src0), (VCVTPD2PSZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; def : Pat<(vselect VK4WM:$mask, - (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), v4f32x_info.ImmAllZerosV), (VCVTPD2PSZ256rmbkz VK4WM:$mask, addr:$src)>; @@ -7691,12 +7695,12 @@ VK2WM:$mask), (VCVTPD2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(X86vfpround (v2f64 (X86VBroadcast (loadf64 addr:$src)))), + def : Pat<(X86vfpround (v2f64 (X86VBroadcastld64 addr:$src))), (VCVTPD2PSZ128rmb addr:$src)>; - def : Pat<(X86vmfpround (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTPD2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86vmfpround (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2PSZ128rmbkz VK2WM:$mask, addr:$src)>; } @@ -8177,12 +8181,12 @@ VK2WM:$mask), (VCVTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2DQZ128rmb addr:$src)>; - def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8206,12 +8210,12 @@ VK2WM:$mask), (VCVTTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcast (loadf64 
addr:$src))))), + def : Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTTPD2DQZ128rmb addr:$src)>; - def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8235,12 +8239,12 @@ VK2WM:$mask), (VCVTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2UDQZ128rmb addr:$src)>; - def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8264,12 +8268,12 @@ VK2WM:$mask), (VCVTTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTTPD2UDQZ128rmb addr:$src)>; - def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>; } @@ -8402,12 +8406,12 @@ VK2WM:$mask), (VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))), + def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), (VCVTQQ2PSZ128rmb addr:$src)>; - def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8431,12 +8435,12 @@ VK2WM:$mask), (VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))), + def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), (VCVTUQQ2PSZ128rmb addr:$src)>; - def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTUQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; } @@ -8748,7 +8752,7 @@ (ins _.ScalarMemOp:$src), OpcodeStr, 
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (OpNode (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX, T8PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -8842,7 +8846,7 @@ (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (OpNode (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -8923,7 +8927,7 @@ (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (fsqrt (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10103,7 +10107,7 @@ (ins _.ScalarMemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", - (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))), + (OpNode (_.VT (_.BroadcastLdFrag addr:$src1)), (i32 timm:$src2))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10166,7 +10170,7 @@ OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (_.VT (_.BroadcastLdFrag addr:$src2)), (i32 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10209,7 +10213,7 @@ OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (_.VT (_.BroadcastLdFrag addr:$src2)), (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10403,7 +10407,7 @@ (bitconvert (CastInfo.VT (X86Shuf128 _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + (_.BroadcastLdFrag addr:$src2), (i8 timm:$src3)))))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10489,7 +10493,7 @@ OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (X86VAlign _.RC:$src1, - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (_.VT (_.BroadcastLdFrag addr:$src2)), (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10581,8 +10585,7 @@ SDNodeXForm ImmXForm> : avx512_vpalign_mask_lowering { def : Pat<(From.VT (OpNode From.RC:$src1, - (bitconvert (To.VT (X86VBroadcast - (To.ScalarLdFrag addr:$src2)))), + (bitconvert (To.VT (To.BroadcastLdFrag addr:$src2))), timm:$src3)), (!cast(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2, (ImmXForm timm:$src3))>; @@ -10591,8 +10594,7 @@ (bitconvert (From.VT (OpNode From.RC:$src1, (bitconvert - (To.VT (X86VBroadcast - (To.ScalarLdFrag addr:$src2)))), + (To.VT (To.BroadcastLdFrag addr:$src2))), timm:$src3))), To.RC:$src0)), (!cast(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask, @@ -10603,8 +10605,7 @@ (bitconvert (From.VT (OpNode From.RC:$src1, (bitconvert - (To.VT (X86VBroadcast - (To.ScalarLdFrag addr:$src2)))), + (To.VT (To.BroadcastLdFrag addr:$src2))), timm:$src3))), To.ImmAllZerosV)), (!cast(OpcodeStr#"rmbikz") To.KRCWM:$mask, @@ -10667,8 +10668,7 @@ (ins _.ScalarMemOp:$src1), OpcodeStr, "${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr, - (_.VT (OpNode (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src1)))))>, + (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src1))))>, EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded]>; } @@ -10811,16 +10811,16 @@ // AVX-512 - MOVDDUP 
//===----------------------------------------------------------------------===// -multiclass avx512_movddup_128 opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_movddup_128 opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable, EVEX, + (_.VT (X86VBroadcast (_.VT _.RC:$src)))>, EVEX, Sched<[sched]>; defm rm : AVX512_maskable, + (_.VT (_.BroadcastLdFrag addr:$src))>, EVEX, EVEX_CD8<_.EltSize, CD8VH>, Sched<[sched.Folded]>; } @@ -10834,7 +10834,7 @@ let Predicates = [HasAVX512, HasVLX] in { defm Z256 : avx512_unary_rm, EVEX_V256; - defm Z128 : avx512_movddup_128, EVEX_V128; } } @@ -10863,10 +10863,10 @@ immAllZerosV), (VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)), (v2f64 VR128X:$src0)), (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)), immAllZerosV), (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; @@ -11207,7 +11207,7 @@ "$src2, ${src3}"##_.BroadcastStr##", $src4", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (_.VT (_.BroadcastLdFrag addr:$src3)), (i8 timm:$src4)), 1, 0>, EVEX_B, AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -11285,12 +11285,12 @@ _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>; // Additional patterns for matching broadcasts in other positions. - def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + def : Pat<(_.VT (OpNode (_.BroadcastLdFrag addr:$src3), _.RC:$src2, _.RC:$src1, (i8 timm:$src4))), (!cast(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (_.BroadcastLdFrag addr:$src3), _.RC:$src2, (i8 timm:$src4))), (!cast(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; @@ -11298,7 +11298,7 @@ // Additional patterns for matching zero masking with broadcasts in other // positions. def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (OpNode (_.BroadcastLdFrag addr:$src3), _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast(Name#_.ZSuffix#rmbikz) _.RC:$src1, @@ -11306,7 +11306,7 @@ (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (_.BroadcastLdFrag addr:$src3), _.RC:$src2, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast(Name#_.ZSuffix#rmbikz) _.RC:$src1, @@ -11316,33 +11316,32 @@ // Additional patterns for matching masked broadcasts with different // operand orders. 
def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (OpNode _.RC:$src1, (_.BroadcastLdFrag addr:$src3), _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (OpNode (_.BroadcastLdFrag addr:$src3), _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (_.BroadcastLdFrag addr:$src3), (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (_.BroadcastLdFrag addr:$src3), _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (OpNode (_.BroadcastLdFrag addr:$src3), _.RC:$src1, _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, @@ -11371,61 +11370,61 @@ // FIXME: Need better DAG canonicalization. let Predicates = [HasVLX] in { def : Pat<(X86vpternlog VR128X:$src1, VR128X:$src2, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src3)))), + (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), (i8 timm:$src4)), (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, timm:$src4)>; - def : Pat<(X86vpternlog (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src3)))), + def : Pat<(X86vpternlog (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), VR128X:$src2, VR128X:$src1, (i8 timm:$src4)), (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src3)))), + (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), VR128X:$src2, (i8 timm:$src4)), (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR128X:$src1, VR128X:$src2, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src3)))), + (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), (i8 timm:$src4)), (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, timm:$src4)>; - def : Pat<(X86vpternlog (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src3)))), + def : Pat<(X86vpternlog (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), VR128X:$src2, VR128X:$src1, (i8 timm:$src4)), (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src3)))), + (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), VR128X:$src2, (i8 timm:$src4)), (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR256X:$src1, VR256X:$src2, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src3)))), + (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), (i8 timm:$src4)), (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, timm:$src4)>; - def : Pat<(X86vpternlog (bc_v8i32 (v8f32 (X86VBroadcast 
(loadf32 addr:$src3)))), + def : Pat<(X86vpternlog (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), VR256X:$src2, VR256X:$src1, (i8 timm:$src4)), (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src3)))), + (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), VR256X:$src2, (i8 timm:$src4)), (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR256X:$src1, VR256X:$src2, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src3)))), + (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), (i8 timm:$src4)), (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, timm:$src4)>; - def : Pat<(X86vpternlog (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src3)))), + def : Pat<(X86vpternlog (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), VR256X:$src2, VR256X:$src1, (i8 timm:$src4)), (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src3)))), + (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), VR256X:$src2, (i8 timm:$src4)), (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; @@ -11433,31 +11432,31 @@ let Predicates = [HasAVX512] in { def : Pat<(X86vpternlog VR512:$src1, VR512:$src2, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src3)))), + (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), (i8 timm:$src4)), (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, timm:$src4)>; - def : Pat<(X86vpternlog (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src3)))), + def : Pat<(X86vpternlog (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), VR512:$src2, VR512:$src1, (i8 timm:$src4)), (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src3)))), + (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), VR512:$src2, (i8 timm:$src4)), (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR512:$src1, VR512:$src2, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src3)))), + (bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), (i8 timm:$src4)), (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, timm:$src4)>; - def : Pat<(X86vpternlog (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src3)))), + def : Pat<(X86vpternlog (bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), VR512:$src2, VR512:$src1, (i8 timm:$src4)), (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src3)))), + (bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), VR512:$src2, (i8 timm:$src4)), (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; @@ -11696,7 +11695,7 @@ "$src2, ${src3}"##_.BroadcastStr##", $src4", (X86VFixupimm (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))), + (TblVT.VT (TblVT.BroadcastLdFrag addr:$src3)), (i32 timm:$src4))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } // Constraints = "$src1 = $dst" @@ -11987,7 +11986,7 @@ "${src3}"##VTI.BroadcastStr##", $src2", "$src2, ${src3}"##VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, - (VTI.VT (X86VBroadcast 
(VTI.ScalarLdFrag addr:$src3))))>, + (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12084,8 +12083,7 @@ OpStr, "${src3}"##VTI.BroadcastStr##", $src2", "$src2, ${src3}"##VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, - (VTI.VT (X86VBroadcast - (VTI.ScalarLdFrag addr:$src3))))>, + (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B, T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12221,7 +12219,7 @@ OpStr, "$src3, ${src2}"##BcstVTI.BroadcastStr##", $src1", "$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3", (OpNode (VTI.VT VTI.RC:$src1), - (bitconvert (BcstVTI.VT (X86VBroadcast (loadi64 addr:$src2)))), + (bitconvert (BcstVTI.VT (X86VBroadcastld64 addr:$src2))), (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12333,7 +12331,7 @@ !strconcat("vp2intersect", _.Suffix, "\t{${src2}", _.BroadcastStr, ", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"), [(set _.KRPC:$dst, (X86vp2intersect - _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))]>, + _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>, EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; } @@ -12434,12 +12432,12 @@ (VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>; def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 - (X86VBroadcast (loadf32 addr:$src))))), + (X86VBroadcastld32 addr:$src)))), (VCVTNEPS2BF16Z128rmb addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)), (v8i16 VR128X:$src0), VK4WM:$mask), (VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)), v8i16x_info.ImmAllZerosV, VK4WM:$mask), (VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>; } @@ -12466,7 +12464,7 @@ !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr), (_.VT (OpNode _.RC:$src1, _.RC:$src2, - (src_v.VT (X86VBroadcast(src_v.ScalarLdFrag addr:$src3)))))>, + (src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>, EVEX_B, EVEX_4V; } Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td +++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -103,6 +103,8 @@ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86VBroadcastld : SDNode<"X86ISD::VBROADCAST_LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisInt<1>, @@ -954,6 +956,26 @@ return cast(N)->getMemoryVT().getStoreSize() == 8; }]>; +def X86VBroadcastld8 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 1; +}]>; + +def X86VBroadcastld16 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 2; +}]>; + +def X86VBroadcastld32 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 4; +}]>; + +def X86VBroadcastld64 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 8; +}]>; + def fp32imm0 : PatLeaf<(f32 fpimm), [{ return 
N->isExactlyValue(+0.0); Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td @@ -6911,10 +6911,10 @@ // class avx_broadcast_rm opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, ValueType VT, - PatFrag ld_frag, SchedWrite Sched> : + PatFrag bcast_frag, SchedWrite Sched> : AVX8I, + [(set RC:$dst, (VT (bcast_frag addr:$src)))]>, Sched<[Sched]>, VEX; // AVX2 adds register forms @@ -6927,15 +6927,15 @@ let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in { def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, - f32mem, v4f32, loadf32, + f32mem, v4f32, X86VBroadcastld32, SchedWriteFShuffle.XMM.Folded>; def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, - f32mem, v8f32, loadf32, + f32mem, v8f32, X86VBroadcastld32, SchedWriteFShuffle.XMM.Folded>, VEX_L; } let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, - v4f64, loadf64, + v4f64, X86VBroadcastld64, SchedWriteFShuffle.XMM.Folded>, VEX_L; let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in { @@ -7406,7 +7406,7 @@ // destination operand // multiclass avx2_broadcast opc, string OpcodeStr, - X86MemOperand x86memop, PatFrag ld_frag, + X86MemOperand x86memop, PatFrag bcast_frag, ValueType OpVT128, ValueType OpVT256, Predicate prd> { let Predicates = [HasAVX2, prd] in { def rr : AVX28I, + (OpVT128 (bcast_frag addr:$src)))]>, Sched<[SchedWriteShuffle.XMM.Folded]>, VEX; def Yrr : AVX28I, + (OpVT256 (bcast_frag addr:$src)))]>, Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L; // Provide aliases for broadcast from the same register class that @@ -7438,13 +7438,13 @@ } } -defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, +defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8, v16i8, v32i8, NoVLX_Or_NoBWI>; -defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, +defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16, v8i16, v16i16, NoVLX_Or_NoBWI>; -defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, +defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32, v4i32, v8i32, NoVLX>; -defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, +defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64, v2i64, v4i64, NoVLX>; let Predicates = [HasAVX2, NoVLX] in { @@ -7453,6 +7453,12 @@ (VPBROADCASTQrm addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQYrm addr:$src)>; + + // FIXME this is to handle aligned extloads from i8/i16. + def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDrm addr:$src)>; + def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDYrm addr:$src)>; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. @@ -7473,6 +7479,12 @@ def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWYrm addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. 
+ def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWYrm addr:$src)>; } let Predicates = [HasAVX2, NoVLX] in { @@ -7518,11 +7530,11 @@ // AVX1 broadcast patterns let Predicates = [HasAVX1Only] in { -def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), +def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)), (VBROADCASTSSYrm addr:$src)>; -def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), +def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)), (VBROADCASTSDYrm addr:$src)>; -def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), +def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)), (VBROADCASTSSrm addr:$src)>; } @@ -7532,7 +7544,7 @@ // 128bit broadcasts: def : Pat<(v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; - def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)), (VMOVDDUPrm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast v2f64:$src)), @@ -7568,7 +7580,7 @@ def : Pat<(v2i64 (X86VBroadcast i64:$src)), (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>; - def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)), (VMOVDDUPrm addr:$src)>; } Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -29,11 +29,10 @@ define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastd %eax, %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x7c,0xc8] +; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x4c,0x24,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x7c,0xc0] -; X86-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7c,0xd0] +; X86-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0xc1] +; X86-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0xd1] ; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] ; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] ; X86-NEXT: retl ## encoding: [0xc3] Index: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -49,11 +49,10 @@ define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512: ; X86: # %bb.0: -; X86-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastb %eax, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x7a,0xc8] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0x4c,0x24,0x04] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpbroadcastb %eax, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7a,0xc0] -; 
X86-NEXT: vpbroadcastb %eax, %zmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7a,0xd0] +; X86-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x6f,0xc1] +; X86-NEXT: vmovdqu8 %zmm1, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0xd1] ; X86-NEXT: vpaddb %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc2] ; X86-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] ; X86-NEXT: retl # encoding: [0xc3] @@ -79,11 +78,10 @@ define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastw %eax, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x7b,0xc8] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x79,0x4c,0x24,0x02] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpbroadcastw %eax, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7b,0xc0] -; X86-NEXT: vpbroadcastw %eax, %zmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7b,0xd0] +; X86-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc1] +; X86-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0xd1] ; X86-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] ; X86-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] ; X86-NEXT: retl # encoding: [0xc3] Index: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -7,11 +7,10 @@ define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128: ; X86: # %bb.0: -; X86-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastb %eax, %xmm1 # encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xc8] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0x4c,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpbroadcastb %eax, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc0] -; X86-NEXT: vpbroadcastb %eax, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xd0] +; X86-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x6f,0xc1] +; X86-NEXT: vmovdqu8 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x6f,0xd1] ; X86-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2] ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] ; X86-NEXT: retl # encoding: [0xc3] @@ -39,12 +38,11 @@ define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastw %eax, %xmm1 # encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xc8] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] -; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] -; X86-NEXT: vpbroadcastw %eax, %xmm0 {%k1} # encoding: 
[0x62,0xf2,0x7d,0x09,0x7b,0xc0] -; X86-NEXT: vpbroadcastw %eax, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xd0] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x4c,0x24,0x04] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1] +; X86-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xd1] ; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; X86-NEXT: retl # encoding: [0xc3] @@ -72,11 +70,10 @@ define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256: ; X86: # %bb.0: -; X86-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastb %eax, %ymm1 # encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xc8] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0x4c,0x24,0x04] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpbroadcastb %eax, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc0] -; X86-NEXT: vpbroadcastb %eax, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xd0] +; X86-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x6f,0xc1] +; X86-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0xd1] ; X86-NEXT: vpaddb %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc2] ; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] ; X86-NEXT: retl # encoding: [0xc3] @@ -105,11 +102,10 @@ define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastw %eax, %ymm1 # encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xc8] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x4c,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpbroadcastw %eax, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc0] -; X86-NEXT: vpbroadcastw %eax, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xd0] +; X86-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xc1] +; X86-NEXT: vmovdqu16 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0xd1] ; X86-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] ; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; X86-NEXT: retl # encoding: [0xc3] Index: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -7,12 +7,11 @@ define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128: ; X86: # %bb.0: 
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd %eax, %xmm1 # encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xc8]
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT: vpbroadcastd %eax, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc0]
-; X86-NEXT: vpbroadcastd %eax, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xd0]
+; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x4c,0x24,0x04]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc1]
+; X86-NEXT: vmovdqa32 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x6f,0xd1]
; X86-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2]
; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
@@ -72,12 +71,11 @@
define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) {
; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd %eax, %ymm1 # encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xc8]
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT: vpbroadcastd %eax, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc0]
-; X86-NEXT: vpbroadcastd %eax, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xd0]
+; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x4c,0x24,0x04]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xc1]
+; X86-NEXT: vmovdqa32 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0xd1]
; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
Index: llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -496,12 +496,12 @@
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
; AVX512F-NEXT: kmovw %edi, %k2
-; AVX512F-NEXT: movl {{.*}}(%rip), %eax
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k2} {z}
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i32_32i8:
@@ -898,18 +898,18 @@
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: kmovw %eax, %k3
; AVX512F-NEXT: kmovw %edi, %k4
-; AVX512F-NEXT: movl {{.*}}(%rip), %eax
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k4} {z}
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k3} {z}
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k4} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm2 {%k2} {z}
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k3} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i64_64i8: