diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1591,7 +1591,14 @@
   /// chain to the token factor. This ensures that the new memory node will have
   /// the same relative memory dependency position as the old load. Returns the
   /// new merged load chain.
-  SDValue makeEquivalentMemoryOrdering(LoadSDNode *Old, SDValue New);
+  SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain);
+
+  /// If an existing load has uses of its chain, create a token factor node with
+  /// that chain and the new memory node's chain and update users of the old
+  /// chain to the token factor. This ensures that the new memory node will have
+  /// the same relative memory dependency position as the old load. Returns the
+  /// new merged load chain.
+  SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp);
 
   /// Topological-sort the AllNodes list and a
   /// assign a unique node id for each node in the DAG based on their
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8966,25 +8966,32 @@
   DbgInfo->add(DB);
 }
 
-SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
-                                                   SDValue NewMemOp) {
-  assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain,
+                                                   SDValue NewMemOpChain) {
+  assert(isa<MemSDNode>(NewMemOpChain) && "Expected a memop node");
+  assert(NewMemOpChain.getValueType() == MVT::Other && "Expected a token VT");
 
   // The new memory operation must have the same position as the old load in
   // terms of memory dependency. Create a TokenFactor for the old load and new
   // memory operation and update uses of the old load's output chain to use that
   // TokenFactor.
-  SDValue OldChain = SDValue(OldLoad, 1);
-  SDValue NewChain = SDValue(NewMemOp.getNode(), 1);
-  if (OldChain == NewChain || !OldLoad->hasAnyUseOfValue(1))
-    return NewChain;
+  if (OldChain == NewMemOpChain || OldChain.use_empty())
+    return NewMemOpChain;
 
-  SDValue TokenFactor =
-      getNode(ISD::TokenFactor, SDLoc(OldLoad), MVT::Other, OldChain, NewChain);
+  SDValue TokenFactor = getNode(ISD::TokenFactor, SDLoc(OldChain), MVT::Other,
+                                OldChain, NewMemOpChain);
   ReplaceAllUsesOfValueWith(OldChain, TokenFactor);
-  UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewChain);
+  UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewMemOpChain);
   return TokenFactor;
 }
 
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
+                                                   SDValue NewMemOp) {
+  assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
+  SDValue OldChain = SDValue(OldLoad, 1);
+  SDValue NewMemOpChain = NewMemOp.getValue(1);
+  return makeEquivalentMemoryOrdering(OldChain, NewMemOpChain);
+}
+
 SDValue SelectionDAG::getSymbolFunctionGlobalAddress(SDValue Op,
                                                      Function **OutFunction) {
   assert(isa<ExternalSymbolSDNode>(Op) && "Node should be an ExternalSymbol");
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -776,9 +776,12 @@
     // extract_vector_elt, store.
     VEXTRACT_STORE,
 
-    // scalar broadcast from memory
+    // scalar broadcast from memory.
     VBROADCAST_LOAD,
 
+    // subvector broadcast from memory.
+    SUBV_BROADCAST_LOAD,
+
     // Store FP control world into i16 memory.
     FNSTCW16m,
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6644,15 +6644,30 @@
   }
 
   // Extract constant bits from a subvector broadcast.
-  if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) {
-    SmallVector<APInt, 16> SubEltBits;
-    if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
-                                      UndefElts, SubEltBits, AllowWholeUndefs,
-                                      AllowPartialUndefs)) {
-      UndefElts = APInt::getSplat(NumElts, UndefElts);
-      while (EltBits.size() < NumElts)
-        EltBits.append(SubEltBits.begin(), SubEltBits.end());
-      return true;
+  if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
+    auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+    SDValue Ptr = MemIntr->getBasePtr();
+    if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
+      Type *CstTy = Cst->getType();
+      unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
+      if (!CstTy->isVectorTy() || (SizeInBits % CstSizeInBits) != 0)
+        return false;
+      unsigned SubEltSizeInBits = CstTy->getScalarSizeInBits();
+      unsigned NumSubElts = CstSizeInBits / SubEltSizeInBits;
+      unsigned NumSubVecs = SizeInBits / CstSizeInBits;
+      APInt UndefSubElts(NumSubElts, 0);
+      SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
+                                        APInt(SubEltSizeInBits, 0));
+      for (unsigned i = 0; i != NumSubElts; ++i) {
+        if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
+                                 UndefSubElts, i))
+          return false;
+        for (unsigned j = 1; j != NumSubVecs; ++j)
+          SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
+      }
+      UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
+                                     UndefSubElts);
+      return CastBitData(UndefSubElts, SubEltBits);
     }
   }
 
@@ -8802,17 +8817,19 @@
       }
 
       if (SplatBitSize > 64) {
         // Load the vector of constants and broadcast it.
-        MVT CVT = VT.getScalarType();
         Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
         SDValue VCP = DAG.getConstantPool(VecC, PVT);
         unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
+        MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
         Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
-        Ld = DAG.getLoad(
-            MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
-            MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
-            Alignment);
-        return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
+        SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+        SDValue Ops[] = {DAG.getEntryNode(), VCP};
+        MachinePointerInfo MPI =
+            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+        return DAG.getMemIntrinsicNode(
+            X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
+            MachineMemOperand::MOLoad);
       }
     }
   }
@@ -30929,6 +30946,7 @@
   NODE_NAME_CASE(VBROADCAST_LOAD)
   NODE_NAME_CASE(VBROADCASTM)
   NODE_NAME_CASE(SUBV_BROADCAST)
+  NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
   NODE_NAME_CASE(VPERMILPV)
   NODE_NAME_CASE(VPERMILPI)
   NODE_NAME_CASE(VPERM2X128)
@@ -38056,6 +38074,34 @@
      }
      return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Src, 0,
                                               TLO.DAG, DL, ExtSizeInBits));
+    }
+    case X86ISD::SUBV_BROADCAST_LOAD: {
+      auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
+      EVT MemVT = MemIntr->getMemoryVT();
+      if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
+        SDLoc DL(Op);
+        SDValue Ld =
+            TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
+                            MemIntr->getBasePtr(), MemIntr->getMemOperand());
+        TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
+                                             Ld.getValue(1));
+        return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
+                                                 TLO.DAG, DL, ExtSizeInBits));
+      } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
+        SDLoc DL(Op);
+        EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
+                                      ExtSizeInBits / VT.getScalarSizeInBits());
+        SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
+        SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
+        SDValue Bcst =
+            TLO.DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, DL, Tys,
+                                        Ops, MemVT, MemIntr->getMemOperand());
+        TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
+                                             Bcst.getValue(1));
+        return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
+                                                 TLO.DAG, DL, ExtSizeInBits));
+      }
+      break;
     }
       // Byte shifts by immediate.
     case X86ISD::VSHLDQ:
@@ -44606,6 +44652,29 @@
     }
   }
 
+  // If we also broadcast this as a subvector to a wider type, then just extract
+  // the lowest subvector.
+  if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
+      (RegVT.is128BitVector() || RegVT.is256BitVector())) {
+    SDValue Ptr = Ld->getBasePtr();
+    SDValue Chain = Ld->getChain();
+    for (SDNode *User : Ptr->uses()) {
+      if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
+          cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
+          cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
+          cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
+              MemVT.getSizeInBits() &&
+          !User->hasAnyUseOfValue(1) &&
+          User->getValueSizeInBits(0).getFixedSize() >
+              RegVT.getFixedSizeInBits()) {
+        SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
+                                           RegVT.getSizeInBits());
+        Extract = DAG.getBitcast(RegVT, Extract);
+        return DCI.CombineTo(N, Extract, SDValue(User, 1));
+      }
+    }
+  }
+
   // Cast ptr32 and ptr64 pointers to the default address space before a load.
   unsigned AddrSpace = Ld->getAddressSpace();
   if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
@@ -49321,7 +49390,8 @@
   // extract the lowest subvector instead which should allow
   // SimplifyDemandedVectorElts do more simplifications.
   if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
-                      InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
+                      InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+                      InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD))
     return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
 
   // If we're extracting a broadcasted subvector, just use the source.
@@ -49687,11 +49757,15 @@
   return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
 }
 
-// Try to find a larger VBROADCAST_LOAD that we can extract from. Limit this to
-// cases where the loads have the same input chain and the output chains are
-// unused. This avoids any memory ordering issues.
-static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
-                                      TargetLowering::DAGCombinerInfo &DCI) {
+// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
+// from. Limit this to cases where the loads have the same input chain and the
+// output chains are unused. This avoids any memory ordering issues.
+static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI) {
+  assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
+          N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
+         "Unknown broadcast load type");
+
   // Only do this if the chain result is unused.
   if (N->hasAnyUseOfValue(1))
     return SDValue();
@@ -49706,7 +49780,7 @@
   // Look at other users of our base pointer and try to find a wider broadcast.
   // The input chain and the size of the memory VT must match.
   for (SDNode *User : Ptr->uses())
-    if (User != N && User->getOpcode() == X86ISD::VBROADCAST_LOAD &&
+    if (User != N && User->getOpcode() == N->getOpcode() &&
        cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
        cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
        cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
@@ -49963,7 +50037,8 @@
   case ISD::STRICT_FP_EXTEND:
   case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
   case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
-  case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
+  case X86ISD::VBROADCAST_LOAD:
+  case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
   case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
   case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
   }
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -1456,6 +1456,32 @@
                                EVEX_V512, EVEX_CD8<64, CD8VT4>;
 
 let Predicates = [HasAVX512] in {
+def : Pat<(v8f64 (X86SubVBroadcastld256 addr:$src)),
+          (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v16f32 (X86SubVBroadcastld256 addr:$src)),
+          (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcastld256 addr:$src)),
+          (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcastld256 addr:$src)),
+          (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v32i16 (X86SubVBroadcastld256 addr:$src)),
+          (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcastld256 addr:$src)),
+          (VBROADCASTI64X4rm addr:$src)>;
+
+def : Pat<(v8f64 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v16f32 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTI32X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTI32X4rm addr:$src)>;
+def : Pat<(v32i16 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTI32X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTI32X4rm addr:$src)>;
+
 def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
           (VBROADCASTF64X4rm addr:$src)>;
 def : Pat<(v16i32 (X86SubVBroadcast (loadv8i32 addr:$src))),
@@ -1539,6 +1565,19 @@
                                            v8f32x_info, v4f32x_info>,
                                            EVEX_V256, EVEX_CD8<32, CD8VT4>;
 
+def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTI32X4Z256rm addr:$src)>;
+def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTI32X4Z256rm addr:$src)>;
+def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTI32X4Z256rm addr:$src)>;
+def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTI32X4Z256rm addr:$src)>;
+
 def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
           (VBROADCASTF32X4Z256rm addr:$src)>;
 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -109,6 +109,8 @@
                             [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 def X86VBroadcastld : SDNode<"X86ISD::VBROADCAST_LOAD", SDTLoad,
                              [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86SubVBroadcastld : SDNode<"X86ISD::SUBV_BROADCAST_LOAD", SDTLoad,
+                                [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 
 def SDTVtrunc    : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
                                         SDTCisInt<0>, SDTCisInt<1>,
@@ -965,6 +967,16 @@
   return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
 }]>;
 
+def X86SubVBroadcastld128 : PatFrag<(ops node:$src),
+                                    (X86SubVBroadcastld node:$src), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 16;
+}]>;
+
+def X86SubVBroadcastld256 : PatFrag<(ops node:$src),
+                                    (X86SubVBroadcastld node:$src), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 32;
+}]>;
+
 // Scalar SSE intrinsic fragments to match several different types of loads.
 // Used by scalar SSE intrinsic instructions which have 128 bit types, but
 // only load a single element.
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7016,6 +7016,11 @@
                           Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
 
 let Predicates = [HasAVX, NoVLX] in {
+def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTF128 addr:$src)>;
+def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
+          (VBROADCASTF128 addr:$src)>;
+
 def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
           (VBROADCASTF128 addr:$src)>;
 def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
@@ -7025,6 +7030,15 @@
 // NOTE: We're using FP instructions here, but execution domain fixing can
 // convert to integer when profitable.
let Predicates = [HasAVX, NoVLX] in { +def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)), + (VBROADCASTF128 addr:$src)>; +def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)), + (VBROADCASTF128 addr:$src)>; +def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)), + (VBROADCASTF128 addr:$src)>; +def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)), + (VBROADCASTF128 addr:$src)>; + def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), (VBROADCASTF128 addr:$src)>; def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))), diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll --- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -493,16 +493,16 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) { ; AVX-LABEL: f64xi8_i128: ; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -527,16 +527,16 @@ ; ; AVX-64-LABEL: f64xi8_i128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 +; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3 +; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3 +; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -971,16 +971,16 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) { ; AVX-LABEL: f32xi16_i128: ; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7] -; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; 
AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -1005,16 +1005,16 @@ ; ; AVX-64-LABEL: f32xi16_i128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7] -; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 -; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1 -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 -; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 +; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 +; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 +; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -1307,16 +1307,16 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) { ; AVX-LABEL: f16xi32_i128: ; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] -; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] +; AVX-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX-NEXT: vpaddd %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-NEXT: vpaddd %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -1341,16 +1341,16 @@ ; ; AVX-64-LABEL: f16xi32_i128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] -; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 +; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = 
[0,1,2,3,0,1,2,3] +; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX-64-NEXT: vpaddd %xmm2, %xmm3, %xmm3 +; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-64-NEXT: vpaddd %xmm2, %xmm3, %xmm3 +; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -1423,16 +1423,16 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) { ; AVX-LABEL: f8xi64_i128: ; AVX: # %bb.0: -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,1,0] -; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2 +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] +; AVX-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -770,9 +770,9 @@ ; ; X86-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X86-AVX512: # %bb.0: # %entry -; X86-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,2,0,3,0,4,0] +; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0] +; X86-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; X86-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; X86-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1 ; X86-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; X86-AVX512-NEXT: vmovdqu %ymm0, ga4 @@ -821,9 +821,9 @@ ; ; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X64-AVX512: # %bb.0: # %entry -; X64-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4] +; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,1,2,3,4] +; X64-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; X64-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; X64-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; X64-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1 ; X64-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1 ; X64-AVX512-NEXT: vmovdqu %ymm0, {{.*}}(%rip) @@ -860,9 +860,9 @@ ; ; X86-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64: ; X86-AVX512: # %bb.0: # %entry -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; X86-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm2 ; X86-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 ; X86-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1 ; X86-AVX512-NEXT: vmovupd %ymm0, ga2 @@ -886,9 +886,9 @@ ; ; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64: ; X64-AVX512: 
# %bb.0: # %entry -; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0] +; X64-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; X64-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm2 ; X64-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 ; X64-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1 ; X64-AVX512-NEXT: vmovupd %ymm0, {{.*}}(%rip) @@ -915,23 +915,23 @@ ; X86-AVX1-NEXT: movl %esp, %ebp ; X86-AVX1-NEXT: andl $-32, %esp ; X86-AVX1-NEXT: subl $32, %esp -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,3,4] +; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4] +; X86-AVX1-NEXT: # ymm3 = mem[0,1,0,1] ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm4 -; X86-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 -; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; X86-AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5 +; X86-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; X86-AVX1-NEXT: vpaddd 8(%ebp), %xmm3, %xmm5 -; X86-AVX1-NEXT: vpaddd 24(%ebp), %xmm3, %xmm3 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; X86-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 -; X86-AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; X86-AVX1-NEXT: vpaddd 8(%ebp), %xmm3, %xmm4 +; X86-AVX1-NEXT: vpaddd 24(%ebp), %xmm3, %xmm5 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; X86-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vandps %ymm3, %ymm4, %ymm3 ; X86-AVX1-NEXT: vmovdqu %xmm0, ha4 ; X86-AVX1-NEXT: vmovups %ymm1, hb4 ; X86-AVX1-NEXT: vmovups %ymm3, hc4+32 @@ -947,13 +947,13 @@ ; X86-AVX2-NEXT: movl %esp, %ebp ; X86-AVX2-NEXT: andl $-32, %esp ; X86-AVX2-NEXT: subl $32, %esp -; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,3,4] +; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4] +; X86-AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; X86-AVX2-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 ; X86-AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 ; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; X86-AVX2-NEXT: vpaddd 8(%ebp), %ymm3, %ymm4 ; X86-AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vpaddd 8(%ebp), %ymm3, %ymm4 ; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; X86-AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3 ; X86-AVX2-NEXT: vmovdqu %xmm0, ha4 @@ -967,12 +967,11 @@ ; ; X86-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32: ; X86-AVX512: # %bb.0: # %entry -; X86-AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,3,4] +; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4] +; X86-AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X86-AVX512-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; X86-AVX512-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm4 -; X86-AVX512-NEXT: vpaddd %ymm4, %ymm1, %ymm1 -; X86-AVX512-NEXT: vpand %ymm4, %ymm1, %ymm1 -; X86-AVX512-NEXT: vshufi32x4 {{.*#+}} zmm3 = zmm3[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X86-AVX512-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; X86-AVX512-NEXT: vpand %ymm3, %ymm1, %ymm1 ; X86-AVX512-NEXT: vpaddd %zmm3, %zmm2, %zmm2 ; 
X86-AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm2 ; X86-AVX512-NEXT: vmovdqu %xmm0, ha4 @@ -983,24 +982,24 @@ ; ; X64-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32: ; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,3,4] +; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4] +; X64-AVX1-NEXT: # ymm4 = mem[0,1,0,1] ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm5 -; X64-AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1 -; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 -; X64-AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; X64-AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; X64-AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2 -; X64-AVX1-NEXT: vandps %ymm5, %ymm3, %ymm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; X64-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; X64-AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; X64-AVX1-NEXT: vmovdqu %xmm0, {{.*}}(%rip) ; X64-AVX1-NEXT: vmovups %ymm1, {{.*}}(%rip) ; X64-AVX1-NEXT: vmovups %ymm3, hc4+{{.*}}(%rip) @@ -1010,9 +1009,9 @@ ; ; X64-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32: ; X64-AVX2: # %bb.0: # %entry -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,3,4] +; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4] +; X64-AVX2-NEXT: # ymm4 = mem[0,1,0,1] ; X64-AVX2-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; X64-AVX2-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 ; X64-AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 ; X64-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 ; X64-AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 @@ -1028,12 +1027,11 @@ ; ; X64-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32: ; X64-AVX512: # %bb.0: # %entry -; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,3,4] +; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4] +; X64-AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-AVX512-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; X64-AVX512-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm4 -; X64-AVX512-NEXT: vpaddd %ymm4, %ymm1, %ymm1 -; X64-AVX512-NEXT: vpand %ymm4, %ymm1, %ymm1 -; X64-AVX512-NEXT: vshufi32x4 {{.*#+}} zmm3 = zmm3[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X64-AVX512-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; X64-AVX512-NEXT: vpand %ymm3, %ymm1, %ymm1 ; X64-AVX512-NEXT: vpaddd %zmm3, %zmm2, %zmm2 ; X64-AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm2 ; X64-AVX512-NEXT: vmovdqu %xmm0, {{.*}}(%rip)