diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22027,6 +22027,50 @@
                               NewMask);
 }
 
+// Combine shuffles of bitcasts into a shuffle of the bitcast type, providing
+// the mask can be treated as a larger type.
+static SDValue combineShuffleOfBitcast(ShuffleVectorSDNode *SVN,
+                                       SelectionDAG &DAG,
+                                       const TargetLowering &TLI,
+                                       bool LegalOperations) {
+  SDValue Op0 = SVN->getOperand(0);
+  SDValue Op1 = SVN->getOperand(1);
+  EVT VT = SVN->getValueType(0);
+  if (Op0.getOpcode() != ISD::BITCAST)
+    return SDValue();
+  EVT InVT = Op0.getOperand(0).getValueType();
+  if (!InVT.isVector() ||
+      (!Op1.isUndef() && (Op1.getOpcode() != ISD::BITCAST ||
+                          Op1.getOperand(0).getValueType() != InVT)))
+    return SDValue();
+
+  int VTLanes = VT.getVectorNumElements();
+  int InLanes = InVT.getVectorNumElements();
+  if (VTLanes <= InLanes || VTLanes % InLanes != 0 ||
+      (LegalOperations &&
+       !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, InVT)))
+    return SDValue();
+  int Factor = VTLanes / InLanes;
+
+  // Check that each group of lanes in the mask are either undef or make a valid
+  // mask for the wider lane type.
+  ArrayRef<int> Mask = SVN->getMask();
+  SmallVector<int> NewMask;
+  if (!widenShuffleMaskElts(Factor, Mask, NewMask))
+    return SDValue();
+
+  if (!TLI.isShuffleMaskLegal(NewMask, InVT))
+    return SDValue();
+
+  // Create the new shuffle with the new mask and bitcast it back to the
+  // original type.
+  SDLoc DL(SVN);
+  Op0 = Op0.getOperand(0);
+  Op1 = Op1.isUndef() ? DAG.getUNDEF(InVT) : Op1.getOperand(0);
+  SDValue NewShuf = DAG.getVectorShuffle(InVT, DL, Op0, Op1, NewMask);
+  return DAG.getBitcast(VT, NewShuf);
+}
+
 /// Combine shuffle of shuffle of the form:
 /// shuf (shuf X, undef, InnerMask), undef, OuterMask --> splat X
 static SDValue formSplatFromShuffles(ShuffleVectorSDNode *OuterShuf,
@@ -22477,6 +22521,11 @@
     }
   }
 
+  // Match shuffles of bitcasts, so long as the mask can be treated as the
+  // larger type.
+  if (SDValue V = combineShuffleOfBitcast(SVN, DAG, TLI, LegalOperations))
+    return V;
+
   // Compute the combined shuffle mask for a shuffle with SV0 as the first
   // operand, and SV1 as the second operand.
   // i.e. Merge SVN(OtherSVN, N1) -> shuffle(SV0, SV1, Mask) iff Commute = false
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1026,7 +1026,7 @@
       setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
       setOperationAction(ISD::STORE, MVT::v2f64, Legal);
 
-      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);
+      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
 
       if (Subtarget.hasP8Vector())
         addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);
@@ -1074,7 +1074,7 @@
      setOperationAction(ISD::STORE, MVT::v2i64, Promote);
      AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);
 
-      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);
+      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
 
      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i64, Legal);
@@ -2138,7 +2138,11 @@
 /// specifies a splat of a single element that is suitable for input to
 /// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).
bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { - assert(N->getValueType(0) == MVT::v16i8 && isPowerOf2_32(EltSize) && + EVT VT = N->getValueType(0); + if (VT == MVT::v2i64 || VT == MVT::v2f64) + return EltSize == 8 && N->getMaskElt(0) == N->getMaskElt(1); + + assert(VT == MVT::v16i8 && isPowerOf2_32(EltSize) && EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes"); // The consecutive indices need to specify an element, not part of two @@ -2439,6 +2443,12 @@ SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(N); assert(isSplatShuffleMask(SVOp, EltSize)); + EVT VT = SVOp->getValueType(0); + + if (VT == MVT::v2i64 || VT == MVT::v2f64) + return DAG.getDataLayout().isLittleEndian() ? 1 - SVOp->getMaskElt(0) + : SVOp->getMaskElt(0); + if (DAG.getDataLayout().isLittleEndian()) return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize); else @@ -9927,6 +9937,11 @@ return LdSplt; } } + + // All v2i64 and v2f64 shuffles are legal + if (VT == MVT::v2i64 || VT == MVT::v2f64) + return Op; + if (Subtarget.hasP9Vector() && PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, isLittleEndian)) { diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll --- a/llvm/test/CodeGen/AArch64/insert-extend.ll +++ b/llvm/test/CodeGen/AArch64/insert-extend.ll @@ -47,161 +47,155 @@ ; CHECK-LABEL: large: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: sxtw x9, w1 +; CHECK-NEXT: sxtw x8, w1 ; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3 -; CHECK-NEXT: sxtw x12, w3 -; CHECK-NEXT: add x8, x0, x9 -; CHECK-NEXT: add x10, x8, x9 -; CHECK-NEXT: add x11, x10, x9 -; CHECK-NEXT: add x9, x2, x12 -; CHECK-NEXT: add x13, x9, x12 -; CHECK-NEXT: add x12, x13, x12 -; CHECK-NEXT: ldp s0, s2, [x11] +; CHECK-NEXT: sxtw x11, w3 +; CHECK-NEXT: add x9, x0, x8 +; CHECK-NEXT: add x12, x2, x11 +; CHECK-NEXT: add x10, x9, x8 +; CHECK-NEXT: add x13, x12, x11 +; CHECK-NEXT: add x8, x10, x8 +; CHECK-NEXT: add x11, x13, x11 +; CHECK-NEXT: ldp s1, s5, [x9] +; CHECK-NEXT: ldp s0, s4, [x8] ; CHECK-NEXT: ld1 { v0.s }[1], [x10], #4 -; CHECK-NEXT: ld1 { v0.s }[2], [x8], #4 -; CHECK-NEXT: ld1 { v0.s }[3], [x0], #4 -; CHECK-NEXT: ldp s1, s3, [x12] -; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ld1 { v1.s }[1], [x0], #4 +; CHECK-NEXT: ldp s2, s6, [x11] +; CHECK-NEXT: ldp s3, s7, [x12] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ld1 { v1.s }[1], [x13], #4 -; CHECK-NEXT: ld1 { v1.s }[2], [x9], #4 -; CHECK-NEXT: ld1 { v1.s }[3], [x2], #4 -; CHECK-NEXT: ld1 { v2.s }[1], [x10] -; CHECK-NEXT: ld1 { v3.s }[1], [x13] -; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ld1 { v2.s }[2], [x8] -; CHECK-NEXT: ld1 { v3.s }[2], [x9] -; CHECK-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-NEXT: ushll v5.8h, v5.8b, #0 -; CHECK-NEXT: ld1 { v2.s }[3], [x0] -; CHECK-NEXT: ld1 { v3.s }[3], [x2] -; CHECK-NEXT: usubl v6.4s, v0.4h, v1.4h -; CHECK-NEXT: usubl2 v0.4s, v0.8h, v1.8h -; CHECK-NEXT: usubl v1.4s, v4.4h, v5.4h -; CHECK-NEXT: usubl2 v4.4s, v4.8h, v5.8h -; CHECK-NEXT: ushll v5.8h, v2.8b, #0 -; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #8 -; CHECK-NEXT: ushll v7.8h, v3.8b, #0 -; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #8 -; CHECK-NEXT: usubl2 v16.4s, v5.8h, v7.8h -; CHECK-NEXT: usubl v5.4s, v5.4h, v7.4h +; CHECK-NEXT: ld1 { v2.s }[1], [x13], #4 +; CHECK-NEXT: ld1 { v3.s }[1], [x2], #4 +; CHECK-NEXT: ld1 { v4.s }[1], [x10] +; CHECK-NEXT: ld1 { v5.s }[1], [x0] +; CHECK-NEXT: ld1 { v6.s 
}[1], [x13] +; CHECK-NEXT: ld1 { v7.s }[1], [x2] ; CHECK-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: usubl v16.4s, v0.4h, v2.4h +; CHECK-NEXT: usubl2 v0.4s, v0.8h, v2.8h +; CHECK-NEXT: usubl v2.4s, v1.4h, v3.4h +; CHECK-NEXT: usubl2 v1.4s, v1.8h, v3.8h +; CHECK-NEXT: ushll v3.8h, v4.8b, #0 +; CHECK-NEXT: ushll v4.8h, v5.8b, #0 +; CHECK-NEXT: ushll v5.8h, v6.8b, #0 +; CHECK-NEXT: ushll v6.8h, v7.8b, #0 +; CHECK-NEXT: usubl2 v7.4s, v3.8h, v5.8h +; CHECK-NEXT: usubl v3.4s, v3.4h, v5.4h +; CHECK-NEXT: usubl2 v5.4s, v4.8h, v6.8h +; CHECK-NEXT: usubl v4.4s, v4.4h, v6.4h +; CHECK-NEXT: shl v6.4s, v7.4s, #16 ; CHECK-NEXT: shl v5.4s, v5.4s, #16 -; CHECK-NEXT: shl v7.4s, v16.4s, #16 -; CHECK-NEXT: usubl2 v16.4s, v2.8h, v3.8h -; CHECK-NEXT: usubl v2.4s, v2.4h, v3.4h -; CHECK-NEXT: add v0.4s, v7.4s, v0.4s -; CHECK-NEXT: add v3.4s, v5.4s, v6.4s -; CHECK-NEXT: shl v2.4s, v2.4s, #16 -; CHECK-NEXT: shl v5.4s, v16.4s, #16 -; CHECK-NEXT: rev64 v6.4s, v3.4s -; CHECK-NEXT: rev64 v7.4s, v0.4s -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: add v2.4s, v5.4s, v4.4s -; CHECK-NEXT: rev64 v4.4s, v1.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: add v16.4s, v0.4s, v7.4s -; CHECK-NEXT: add v17.4s, v3.4s, v6.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s +; CHECK-NEXT: shl v3.4s, v3.4s, #16 +; CHECK-NEXT: shl v4.4s, v4.4s, #16 +; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: add v2.4s, v4.4s, v2.4s +; CHECK-NEXT: add v0.4s, v6.4s, v0.4s +; CHECK-NEXT: rev64 v6.4s, v1.4s +; CHECK-NEXT: rev64 v7.4s, v2.4s +; CHECK-NEXT: add v3.4s, v3.4s, v16.4s +; CHECK-NEXT: rev64 v4.4s, v0.4s +; CHECK-NEXT: rev64 v5.4s, v3.4s +; CHECK-NEXT: add v18.4s, v1.4s, v6.4s +; CHECK-NEXT: add v19.4s, v2.4s, v7.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s +; CHECK-NEXT: add v16.4s, v0.4s, v4.4s +; CHECK-NEXT: zip1 v7.4s, v2.4s, v1.4s +; CHECK-NEXT: add v17.4s, v3.4s, v5.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s +; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s ; CHECK-NEXT: uzp2 v6.4s, v17.4s, v16.4s -; CHECK-NEXT: add v19.4s, v2.4s, v5.4s -; CHECK-NEXT: add v20.4s, v1.4s, v4.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s -; CHECK-NEXT: trn2 v18.4s, v17.4s, v16.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s -; CHECK-NEXT: uzp2 v4.4s, v6.4s, v17.4s -; CHECK-NEXT: zip1 v5.4s, v20.4s, v19.4s -; CHECK-NEXT: zip2 v6.4s, v20.4s, v19.4s -; CHECK-NEXT: zip2 v7.4s, v0.4s, v3.4s +; CHECK-NEXT: zip2 v5.4s, v0.4s, v3.4s +; CHECK-NEXT: ext v20.16b, v17.16b, v17.16b, #12 ; CHECK-NEXT: mov v0.s[1], v3.s[0] -; CHECK-NEXT: ext v3.16b, v17.16b, v17.16b, #12 -; CHECK-NEXT: zip1 v19.4s, v1.4s, v2.4s -; CHECK-NEXT: mov v4.d[1], v6.d[1] -; CHECK-NEXT: mov v18.d[1], v5.d[1] -; CHECK-NEXT: ext v3.16b, v16.16b, v3.16b, #12 +; CHECK-NEXT: ext v3.16b, v2.16b, v7.16b, #8 +; CHECK-NEXT: mov v2.s[3], v1.s[2] +; CHECK-NEXT: zip1 v4.4s, v19.4s, v18.4s +; CHECK-NEXT: trn2 v21.4s, v17.4s, v16.4s +; CHECK-NEXT: uzp2 v6.4s, v6.4s, v17.4s ; CHECK-NEXT: mov v17.s[0], v16.s[1] -; CHECK-NEXT: ext v16.16b, v1.16b, v19.16b, #8 -; CHECK-NEXT: mov v1.s[3], v2.s[2] -; CHECK-NEXT: add v2.4s, v4.4s, v18.4s -; CHECK-NEXT: mov v3.d[1], v6.d[1] -; CHECK-NEXT: mov v17.d[1], v5.d[1] -; CHECK-NEXT: mov v0.d[1], v16.d[1] -; CHECK-NEXT: rev64 v4.4s, v2.4s -; CHECK-NEXT: mov v7.d[1], v1.d[1] -; CHECK-NEXT: sub v3.4s, v17.4s, v3.4s -; CHECK-NEXT: add v5.4s, v2.4s, v4.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s -; CHECK-NEXT: sub v4.4s, v0.4s, v7.4s -; CHECK-NEXT: add v0.4s, v7.4s, v0.4s -; CHECK-NEXT: 
rev64 v1.4s, v3.4s -; CHECK-NEXT: rev64 v6.4s, v4.4s -; CHECK-NEXT: rev64 v7.4s, v0.4s -; CHECK-NEXT: rev64 v16.4s, v5.4s -; CHECK-NEXT: add v17.4s, v3.4s, v1.4s -; CHECK-NEXT: add v18.4s, v4.4s, v6.4s -; CHECK-NEXT: add v19.4s, v0.4s, v7.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v6.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s -; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s -; CHECK-NEXT: trn2 v3.4s, v16.4s, v2.4s +; CHECK-NEXT: zip2 v7.4s, v19.4s, v18.4s +; CHECK-NEXT: mov v0.d[1], v3.d[1] +; CHECK-NEXT: ext v1.16b, v16.16b, v20.16b, #12 +; CHECK-NEXT: mov v5.d[1], v2.d[1] +; CHECK-NEXT: mov v17.d[1], v4.d[1] +; CHECK-NEXT: mov v6.d[1], v7.d[1] +; CHECK-NEXT: mov v1.d[1], v7.d[1] +; CHECK-NEXT: add v3.4s, v5.4s, v0.4s +; CHECK-NEXT: mov v21.d[1], v4.d[1] +; CHECK-NEXT: rev64 v4.4s, v3.4s +; CHECK-NEXT: sub v1.4s, v17.4s, v1.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s +; CHECK-NEXT: add v2.4s, v6.4s, v21.4s +; CHECK-NEXT: rev64 v6.4s, v1.4s +; CHECK-NEXT: add v7.4s, v3.4s, v4.4s +; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s +; CHECK-NEXT: rev64 v4.4s, v0.4s +; CHECK-NEXT: rev64 v5.4s, v2.4s +; CHECK-NEXT: add v17.4s, v1.4s, v6.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s +; CHECK-NEXT: add v19.4s, v0.4s, v4.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s +; CHECK-NEXT: ext v16.16b, v7.16b, v3.16b, #4 +; CHECK-NEXT: add v18.4s, v2.4s, v5.4s ; CHECK-NEXT: ext v6.16b, v17.16b, v1.16b, #4 -; CHECK-NEXT: ext v7.16b, v19.16b, v0.16b, #4 -; CHECK-NEXT: ext v16.16b, v18.16b, v4.16b, #4 -; CHECK-NEXT: ext v5.16b, v5.16b, v5.16b, #4 -; CHECK-NEXT: rev64 v6.4s, v6.4s -; CHECK-NEXT: rev64 v7.4s, v7.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s +; CHECK-NEXT: ext v5.16b, v19.16b, v0.16b, #4 ; CHECK-NEXT: rev64 v16.4s, v16.4s -; CHECK-NEXT: mov v17.s[3], v1.s[3] +; CHECK-NEXT: rev64 v6.4s, v6.4s +; CHECK-NEXT: ext v20.16b, v18.16b, v18.16b, #4 +; CHECK-NEXT: rev64 v5.4s, v5.4s +; CHECK-NEXT: mov v7.s[3], v3.s[3] +; CHECK-NEXT: ext v4.16b, v3.16b, v16.16b, #12 ; CHECK-NEXT: mov v19.s[3], v0.s[3] -; CHECK-NEXT: mov v18.s[3], v4.s[3] -; CHECK-NEXT: ext v7.16b, v0.16b, v7.16b, #12 -; CHECK-NEXT: ext v16.16b, v4.16b, v16.16b, #12 +; CHECK-NEXT: mov v17.s[3], v1.s[3] ; CHECK-NEXT: ext v6.16b, v1.16b, v6.16b, #12 -; CHECK-NEXT: trn2 v2.4s, v2.4s, v5.4s -; CHECK-NEXT: sub v20.4s, v19.4s, v7.4s -; CHECK-NEXT: sub v21.4s, v18.4s, v16.4s -; CHECK-NEXT: sub v5.4s, v17.4s, v6.4s -; CHECK-NEXT: mov v18.s[0], v4.s[0] -; CHECK-NEXT: mov v19.s[0], v0.s[0] -; CHECK-NEXT: ext v0.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: ext v5.16b, v0.16b, v5.16b, #12 +; CHECK-NEXT: rev64 v18.4s, v18.4s +; CHECK-NEXT: trn2 v20.4s, v2.4s, v20.4s +; CHECK-NEXT: sub v16.4s, v7.4s, v4.4s +; CHECK-NEXT: sub v21.4s, v17.4s, v6.4s +; CHECK-NEXT: sub v22.4s, v19.4s, v5.4s +; CHECK-NEXT: trn2 v2.4s, v18.4s, v2.4s ; CHECK-NEXT: mov v17.s[0], v1.s[0] -; CHECK-NEXT: add v1.4s, v18.4s, v16.4s -; CHECK-NEXT: add v2.4s, v19.4s, v7.4s -; CHECK-NEXT: add v4.4s, v3.4s, v0.4s -; CHECK-NEXT: sub v0.4s, v3.4s, v0.4s -; CHECK-NEXT: add v3.4s, v17.4s, v6.4s -; CHECK-NEXT: mov v4.d[1], v0.d[1] -; CHECK-NEXT: mov v3.d[1], v5.d[1] -; CHECK-NEXT: mov v1.d[1], v21.d[1] -; CHECK-NEXT: mov v2.d[1], v20.d[1] -; CHECK-NEXT: movi v0.8h, #1 +; CHECK-NEXT: ext v1.16b, v20.16b, v20.16b, #4 +; CHECK-NEXT: mov v19.s[0], v0.s[0] +; CHECK-NEXT: mov v7.s[0], v3.s[0] +; CHECK-NEXT: add v0.4s, v17.4s, v6.4s +; CHECK-NEXT: add v3.4s, v2.4s, v1.4s +; CHECK-NEXT: add v5.4s, v19.4s, v5.4s +; CHECK-NEXT: add v4.4s, v7.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v2.4s, v1.4s +; CHECK-NEXT: mov v4.d[1], v16.d[1] +; CHECK-NEXT: mov 
v5.d[1], v22.d[1] +; CHECK-NEXT: mov v0.d[1], v21.d[1] +; CHECK-NEXT: mov v3.d[1], v1.d[1] +; CHECK-NEXT: movi v1.8h, #1 ; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff -; CHECK-NEXT: ushr v5.4s, v1.4s, #15 +; CHECK-NEXT: ushr v2.4s, v0.4s, #15 ; CHECK-NEXT: ushr v6.4s, v4.4s, #15 -; CHECK-NEXT: ushr v7.4s, v2.4s, #15 -; CHECK-NEXT: ushr v16.4s, v3.4s, #15 -; CHECK-NEXT: and v6.16b, v6.16b, v0.16b -; CHECK-NEXT: and v16.16b, v16.16b, v0.16b -; CHECK-NEXT: and v7.16b, v7.16b, v0.16b -; CHECK-NEXT: and v0.16b, v5.16b, v0.16b -; CHECK-NEXT: mul v5.4s, v6.4s, v17.4s +; CHECK-NEXT: ushr v7.4s, v3.4s, #15 +; CHECK-NEXT: ushr v16.4s, v5.4s, #15 +; CHECK-NEXT: and v6.16b, v6.16b, v1.16b +; CHECK-NEXT: and v16.16b, v16.16b, v1.16b +; CHECK-NEXT: and v7.16b, v7.16b, v1.16b +; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: mul v2.4s, v6.4s, v17.4s ; CHECK-NEXT: mul v6.4s, v16.4s, v17.4s -; CHECK-NEXT: mul v0.4s, v0.4s, v17.4s +; CHECK-NEXT: mul v1.4s, v1.4s, v17.4s ; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s -; CHECK-NEXT: add v4.4s, v5.4s, v4.4s -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: add v2.4s, v7.4s, v2.4s -; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b -; CHECK-NEXT: eor v1.16b, v2.16b, v7.16b -; CHECK-NEXT: eor v2.16b, v3.16b, v6.16b -; CHECK-NEXT: eor v3.16b, v4.16b, v5.16b -; CHECK-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-NEXT: add v4.4s, v2.4s, v4.4s +; CHECK-NEXT: add v5.4s, v6.4s, v5.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v3.4s, v7.4s, v3.4s +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v7.16b +; CHECK-NEXT: eor v3.16b, v5.16b, v6.16b +; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b +; CHECK-NEXT: add v2.4s, v2.4s, v3.4s ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 diff --git a/llvm/test/CodeGen/ARM/neon-copy.ll b/llvm/test/CodeGen/ARM/neon-copy.ll --- a/llvm/test/CodeGen/ARM/neon-copy.ll +++ b/llvm/test/CodeGen/ARM/neon-copy.ll @@ -1741,8 +1741,7 @@ define <4 x i32> @test_concat_v4i32_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) #0 { ; CHECK-LABEL: test_concat_v4i32_v4i32_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vext.32 q8, q0, q0, #2 -; CHECK-NEXT: vext.32 q0, q8, q1, #2 +; CHECK-NEXT: vmov.f64 d1, d2 ; CHECK-NEXT: bx lr entry: %vecinit6 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> @@ -1753,8 +1752,7 @@ ; CHECK-LABEL: test_concat_v4i32_v2i32_v4i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: vext.32 q8, q0, q0, #2 -; CHECK-NEXT: vext.32 q0, q8, q1, #2 +; CHECK-NEXT: vmov.f64 d1, d2 ; CHECK-NEXT: bx lr entry: %vecext = extractelement <2 x i32> %x, i32 0 diff --git a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll --- a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll +++ b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll @@ -56,7 +56,6 @@ ; CHECK-NEXT: bne .LBB3_2 ; CHECK-NEXT: @ %bb.1: @ %bb1.preheader ; CHECK-NEXT: vmov.i32 q8, #0x0 -; CHECK-NEXT: vext.8 q8, q8, q8, #4 ; CHECK-NEXT: .LBB3_2: @ %bb2 ; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 diff --git a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll --- a/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll +++ b/llvm/test/CodeGen/PowerPC/aix-vsx-splatimm.ll @@ -13,33 +13,23 @@ ; CHECK-AIX-NEXT: # %bb.1: # %bb3 ; CHECK-AIX-NEXT: srwi 4, 4, 16 ; 
CHECK-AIX-NEXT: srwi 5, 5, 16 -; CHECK-AIX-NEXT: mullw 4, 5, 4 -; CHECK-AIX-NEXT: lwz 5, 0(3) ; CHECK-AIX-NEXT: slwi 3, 3, 8 +; CHECK-AIX-NEXT: mullw 4, 5, 4 ; CHECK-AIX-NEXT: neg 3, 3 +; CHECK-AIX-NEXT: lwz 5, 0(3) +; CHECK-AIX-NEXT: sth 3, -16(1) +; CHECK-AIX-NEXT: addi 3, 1, -16 +; CHECK-AIX-NEXT: lxvw4x 34, 0, 3 ; CHECK-AIX-NEXT: srwi 5, 5, 1 +; CHECK-AIX-NEXT: mullw 3, 4, 5 +; CHECK-AIX-NEXT: li 4, 0 +; CHECK-AIX-NEXT: vsplth 2, 2, 0 +; CHECK-AIX-NEXT: neg 3, 3 +; CHECK-AIX-NEXT: stxvw4x 34, 0, 4 ; CHECK-AIX-NEXT: sth 3, -32(1) ; CHECK-AIX-NEXT: addi 3, 1, -32 -; CHECK-AIX-NEXT: mullw 4, 4, 5 -; CHECK-AIX-NEXT: li 5, 0 -; CHECK-AIX-NEXT: sth 5, -48(1) -; CHECK-AIX-NEXT: neg 4, 4 -; CHECK-AIX-NEXT: sth 4, -16(1) -; CHECK-AIX-NEXT: addi 4, 1, -48 -; CHECK-AIX-NEXT: lxvw4x 34, 0, 4 -; CHECK-AIX-NEXT: lxvw4x 35, 0, 3 -; CHECK-AIX-NEXT: addi 3, 1, -16 -; CHECK-AIX-NEXT: lxvw4x 36, 0, 3 -; CHECK-AIX-NEXT: ld 3, L..C0(2) # %const.0 -; CHECK-AIX-NEXT: vmrghh 3, 2, 3 -; CHECK-AIX-NEXT: vmrghh 4, 4, 2 +; CHECK-AIX-NEXT: lxvw4x 34, 0, 3 ; CHECK-AIX-NEXT: vsplth 2, 2, 0 -; CHECK-AIX-NEXT: xxmrghw 34, 35, 34 -; CHECK-AIX-NEXT: lxvw4x 35, 0, 3 -; CHECK-AIX-NEXT: vperm 2, 2, 4, 3 -; CHECK-AIX-NEXT: vsplth 3, 2, 1 -; CHECK-AIX-NEXT: vsplth 2, 2, 4 -; CHECK-AIX-NEXT: stxvw4x 35, 0, 5 ; CHECK-AIX-NEXT: stxvw4x 34, 0, 3 ; ; CHECK-LABEL: test_aix_splatimm: @@ -53,25 +43,15 @@ ; CHECK-NEXT: slwi 3, 3, 8 ; CHECK-NEXT: neg 3, 3 ; CHECK-NEXT: srwi 5, 5, 1 -; CHECK-NEXT: mtvsrd 35, 3 -; CHECK-NEXT: mullw 4, 4, 5 -; CHECK-NEXT: li 5, 0 -; CHECK-NEXT: mtvsrd 34, 5 -; CHECK-NEXT: vmrghh 3, 3, 2 -; CHECK-NEXT: neg 3, 4 -; CHECK-NEXT: mtvsrd 36, 3 -; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l -; CHECK-NEXT: vmrghh 4, 2, 4 -; CHECK-NEXT: vsplth 2, 2, 3 -; CHECK-NEXT: xxmrglw 34, 34, 35 -; CHECK-NEXT: lvx 3, 0, 3 +; CHECK-NEXT: mtvsrd 34, 3 ; CHECK-NEXT: li 3, 0 -; CHECK-NEXT: vperm 2, 4, 2, 3 -; CHECK-NEXT: vsplth 3, 2, 6 +; CHECK-NEXT: mullw 4, 4, 5 ; CHECK-NEXT: vsplth 2, 2, 3 -; CHECK-NEXT: stvx 3, 0, 3 ; CHECK-NEXT: stvx 2, 0, 3 +; CHECK-NEXT: neg 4, 4 +; CHECK-NEXT: mtvsrd 35, 4 +; CHECK-NEXT: vsplth 3, 3, 3 +; CHECK-NEXT: stvx 3, 0, 3 bb: br i1 undef, label %bb22, label %bb3 diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -865,14 +865,14 @@ ; ; CHECK-P9-LABEL: testSplati64_1: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: lxv v2, 0(r3) -; CHECK-P9-NEXT: xxspltd v2, v2, 0 +; CHECK-P9-NEXT: addi r3, r3, 8 +; CHECK-P9-NEXT: lxvdsx v2, 0, r3 ; CHECK-P9-NEXT: blr ; ; CHECK-P9-BE-LABEL: testSplati64_1: ; CHECK-P9-BE: # %bb.0: # %entry -; CHECK-P9-BE-NEXT: lxv v2, 0(r3) -; CHECK-P9-BE-NEXT: xxspltd v2, v2, 1 +; CHECK-P9-BE-NEXT: addi r3, r3, 8 +; CHECK-P9-BE-NEXT: lxvdsx v2, 0, r3 ; CHECK-P9-BE-NEXT: blr ; ; CHECK-NOVSX-LABEL: testSplati64_1: diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll --- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll +++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll @@ -604,31 +604,22 @@ ; ; P9-AIX32-LABEL: adjusted_lxvwsx: ; P9-AIX32: # %bb.0: # %entry -; P9-AIX32-NEXT: lwz r3, 4(r3) -; P9-AIX32-NEXT: stw r3, -16(r1) -; P9-AIX32-NEXT: lxv vs0, -16(r1) -; P9-AIX32-NEXT: xxmrghw v2, vs0, vs0 -; P9-AIX32-NEXT: xxspltw v2, v2, 1 +; P9-AIX32-NEXT: addi r3, r3, 4 +; P9-AIX32-NEXT: lxvwsx v2, 0, r3 ; 
P9-AIX32-NEXT: blr ; ; P8-AIX32-LABEL: adjusted_lxvwsx: ; P8-AIX32: # %bb.0: # %entry -; P8-AIX32-NEXT: lwz r3, 4(r3) -; P8-AIX32-NEXT: addi r4, r1, -16 -; P8-AIX32-NEXT: stw r3, -16(r1) -; P8-AIX32-NEXT: lxvw4x vs0, 0, r4 -; P8-AIX32-NEXT: xxmrghw v2, vs0, vs0 -; P8-AIX32-NEXT: xxspltw v2, v2, 1 +; P8-AIX32-NEXT: addi r3, r3, 4 +; P8-AIX32-NEXT: lfiwzx f0, 0, r3 +; P8-AIX32-NEXT: xxspltw v2, vs0, 1 ; P8-AIX32-NEXT: blr ; ; P7-AIX32-LABEL: adjusted_lxvwsx: ; P7-AIX32: # %bb.0: # %entry -; P7-AIX32-NEXT: lwz r3, 4(r3) -; P7-AIX32-NEXT: addi r4, r1, -16 -; P7-AIX32-NEXT: stw r3, -16(r1) -; P7-AIX32-NEXT: lxvw4x vs0, 0, r4 -; P7-AIX32-NEXT: xxmrghw v2, vs0, vs0 -; P7-AIX32-NEXT: xxspltw v2, v2, 1 +; P7-AIX32-NEXT: addi r3, r3, 4 +; P7-AIX32-NEXT: lfiwzx f0, 0, r3 +; P7-AIX32-NEXT: xxspltw v2, vs0, 1 ; P7-AIX32-NEXT: blr entry: %0 = bitcast i64* %s to <8 x i8>* diff --git a/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll b/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll --- a/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll +++ b/llvm/test/CodeGen/PowerPC/vsx_shuffle_le.ll @@ -18,8 +18,7 @@ ; ; CHECK-P9-LABEL: test00: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv 0, 0(3) -; CHECK-P9-NEXT: xxspltd 34, 0, 1 +; CHECK-P9-NEXT: lxvdsx 34, 0, 3 ; CHECK-P9-NEXT: blr %v1 = load <2 x double>, <2 x double>* %p1 %v2 = load <2 x double>, <2 x double>* %p2 @@ -113,8 +112,8 @@ ; ; CHECK-P9-LABEL: test11: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv 0, 0(3) -; CHECK-P9-NEXT: xxspltd 34, 0, 0 +; CHECK-P9-NEXT: addi 3, 3, 8 +; CHECK-P9-NEXT: lxvdsx 34, 0, 3 ; CHECK-P9-NEXT: blr %v1 = load <2 x double>, <2 x double>* %p1 %v2 = load <2 x double>, <2 x double>* %p2 @@ -219,8 +218,7 @@ ; ; CHECK-P9-LABEL: test22: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv 0, 0(4) -; CHECK-P9-NEXT: xxspltd 34, 0, 1 +; CHECK-P9-NEXT: lxvdsx 34, 0, 4 ; CHECK-P9-NEXT: blr %v1 = load <2 x double>, <2 x double>* %p1 %v2 = load <2 x double>, <2 x double>* %p2 @@ -314,8 +312,8 @@ ; ; CHECK-P9-LABEL: test33: ; CHECK-P9: # %bb.0: -; CHECK-P9-NEXT: lxv 0, 0(4) -; CHECK-P9-NEXT: xxspltd 34, 0, 0 +; CHECK-P9-NEXT: addi 3, 4, 8 +; CHECK-P9-NEXT: lxvdsx 34, 0, 3 ; CHECK-P9-NEXT: blr %v1 = load <2 x double>, <2 x double>* %p1 %v2 = load <2 x double>, <2 x double>* %p2 diff --git a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll --- a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll @@ -8,8 +8,8 @@ ; CHECK-LABEL: shuffle_i16_45670123: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr @@ -279,8 +279,8 @@ ; CHECK-LABEL: shuffle_f16_45670123: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov.f32 s7, s1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2-post.ll @@ -68,22 +68,18 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, lr} ; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov r2, r12, d1 +; CHECK-NEXT: vmov r3, lr, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0], #32 -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s9, 
s3 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov lr, r12, d3 -; CHECK-NEXT: vmov r2, r5, d0 -; CHECK-NEXT: vmov r4, r7, d4 -; CHECK-NEXT: vmov r3, r6, d1 -; CHECK-NEXT: adds.w r3, r3, lr -; CHECK-NEXT: adc.w r6, r6, r12 -; CHECK-NEXT: adds r2, r2, r4 -; CHECK-NEXT: adcs r7, r5 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vmov q0[3], q0[1], r7, r6 +; CHECK-NEXT: vmov r4, r7, d1 +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov r3, r6, d0 +; CHECK-NEXT: adc.w r5, lr, r12 +; CHECK-NEXT: adds r3, r3, r4 +; CHECK-NEXT: adcs r7, r6 +; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r7, r5 ; CHECK-NEXT: vstrw.32 q0, [r1] ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll --- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll @@ -320,16 +320,12 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vmov lr, r12, d1 +; CHECK-NEXT: vmov r3, r2, d0 ; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s8, s2 -; CHECK-NEXT: vmov.f32 s9, s3 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov lr, r12, d3 +; CHECK-NEXT: vmov r0, r4, d1 ; CHECK-NEXT: vmov r5, r6, d0 -; CHECK-NEXT: vmov r0, r4, d4 -; CHECK-NEXT: vmov r3, r2, d1 ; CHECK-NEXT: adds.w r3, r3, lr ; CHECK-NEXT: adc.w r2, r2, r12 ; CHECK-NEXT: adds r0, r0, r5 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -1202,14 +1202,16 @@ define void @vst3_v2f16(<2 x half> *%src, <6 x half> *%dst) { ; CHECK-LABEL: vst3_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldmia r0, {s0, s1} +; CHECK-NEXT: ldrd r2, r3, [r0] ; CHECK-NEXT: ldr r0, [r0, #8] -; CHECK-NEXT: vmovx.f16 s2, s0 -; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r3 ; CHECK-NEXT: vmov.32 q1[0], r0 +; CHECK-NEXT: vmovx.f16 s2, s0 ; CHECK-NEXT: vmovx.f16 s6, s4 ; CHECK-NEXT: vins.f16 s4, s2 ; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vins.f16 s2, s6 ; CHECK-NEXT: vmov r3, s2 diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -1051,17 +1051,19 @@ define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) { ; CHECK-LABEL: vst4_v2f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldr s0, [r0] -; CHECK-NEXT: vldr s5, [r0, #4] -; CHECK-NEXT: vldr s4, [r0, #8] +; CHECK-NEXT: ldm.w r0, {r2, r3, r12} +; CHECK-NEXT: vmov.32 q1[0], r12 +; CHECK-NEXT: ldr r0, [r0, #12] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.32 q1[1], r0 ; CHECK-NEXT: vmovx.f16 s2, s0 -; CHECK-NEXT: vldr s1, [r0, #12] -; CHECK-NEXT: vmovx.f16 s6, s5 +; CHECK-NEXT: vmovx.f16 s6, s1 ; CHECK-NEXT: vmovx.f16 s3, s4 ; CHECK-NEXT: vins.f16 s2, s6 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vins.f16 s4, s1 -; CHECK-NEXT: vins.f16 s0, s5 +; CHECK-NEXT: vmovx.f16 s6, s5 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vins.f16 s0, s1 ; CHECK-NEXT: vins.f16 s3, s6 ; CHECK-NEXT: vmov.f32 s1, s4 ; CHECK-NEXT: vstrh.16 q0, [r1] diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll 
b/llvm/test/CodeGen/X86/avx-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -977,8 +977,7 @@ ; X86: ## %bb.0: ## %bb ; X86-NEXT: subl $12, %esp ; X86-NEXT: movq %mm0, (%esp) -; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X86-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -810,21 +810,21 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 x i8>* %q3) nounwind { ; SSE2-LABEL: interleave_24i8_in: ; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,3,3,3,4,5,6,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE2-NEXT: pandn %xmm3, %xmm5 ; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,5] @@ -836,8 +836,8 @@ ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,6] ; SSE2-NEXT: pandn %xmm5, %xmm4 ; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] ; SSE2-NEXT: packuswb %xmm1, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] @@ -853,17 +853,17 @@ ; SSE42: # %bb.0: ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = 
xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5] -; SSE42-NEXT: movdqa %xmm1, %xmm3 +; SSE42-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE42-NEXT: movdqa %xmm2, %xmm1 +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] +; SSE42-NEXT: movdqa %xmm0, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero -; SSE42-NEXT: por %xmm2, %xmm3 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm0, %xmm1 -; SSE42-NEXT: movq %xmm1, 16(%rdi) +; SSE42-NEXT: por %xmm1, %xmm3 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[13],zero,xmm2[6,14],zero,xmm2[7,15],zero,xmm2[u,u,u,u,u,u,u,u] +; SSE42-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm2, %xmm0 +; SSE42-NEXT: movq %xmm0, 16(%rdi) ; SSE42-NEXT: movdqu %xmm3, (%rdi) ; SSE42-NEXT: retq ; @@ -871,14 +871,14 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero +; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero ; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovq %xmm0, 16(%rdi) ; AVX-NEXT: vmovdqu %xmm2, (%rdi) ; AVX-NEXT: retq @@ -887,10 +887,10 @@ ; XOP: # %bb.0: ; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm0[0,8],xmm1[0],xmm0[1,9],xmm1[1],xmm0[2,10],xmm1[2],xmm0[3,11],xmm1[3],xmm0[4,12],xmm1[4],xmm0[5] -; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[13],xmm1[5],xmm0[6,14],xmm1[6],xmm0[7,15],xmm1[7],xmm0[u,u,u,u,u,u,u,u] +; XOP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[0,8],xmm0[0],xmm1[1,9],xmm0[1],xmm1[2,10],xmm0[2],xmm1[3,11],xmm0[3],xmm1[4,12],xmm0[4],xmm1[5] +; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[13],xmm0[5],xmm1[6,14],xmm0[6],xmm1[7,15],xmm0[7],xmm1[u,u,u,u,u,u,u,u] ; XOP-NEXT: vmovq %xmm0, 
16(%rdi) ; XOP-NEXT: vmovdqu %xmm2, (%rdi) ; XOP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-2.ll @@ -45,15 +45,15 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: movdqa %xmm0, (%rdx) +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: vf4: ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -61,7 +61,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) ; AVX512-NEXT: retq %in.vec0 = load <4 x i16>, <4 x i16>* %in.vecptr0, align 32 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -62,25 +62,25 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,0] +; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,6,4] ; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movq %xmm2, 16(%rcx) +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movq %xmm1, 16(%rcx) ; SSE-NEXT: movdqa 
%xmm3, (%rcx) ; SSE-NEXT: retq ; @@ -88,15 +88,15 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,u,u,2,3,10,11,u,u,4,5,12,13] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] ; AVX1-NEXT: vmovdqa %xmm0, (%rcx) ; AVX1-NEXT: vmovq %xmm2, 16(%rcx) ; AVX1-NEXT: retq @@ -105,9 +105,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,12,13,20,21],zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[22,23,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -122,9 +122,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u> ; AVX512-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll @@ -78,16 +78,16 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm2[0],xmm3[0] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-NEXT: vpunpcklwd {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX1-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,3,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,3,1,4,5,6,7] ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -99,10 +99,10 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,22,23,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -116,10 +116,10 @@ ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 @@ -132,10 +132,10 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,22,23,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] @@ -149,10 +149,10 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = 
mem[0],zero -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15] ; AVX512-NEXT: vpermw %ymm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -177,10 +177,10 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,1,2,3] @@ -206,10 +206,10 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 @@ -233,10 +233,10 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3 @@ -259,10 +259,10 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -153,8 +153,8 @@ ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,2,0] @@ -163,16 +163,16 @@ ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm6[0,2] ; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1] -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[0,2] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,1,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] -; SSE-NEXT: movaps %xmm2, 16(%rax) +; SSE-NEXT: movaps %xmm3, 16(%rax) ; SSE-NEXT: movaps %xmm5, (%rax) ; SSE-NEXT: movaps %xmm0, 32(%rax) ; SSE-NEXT: retq @@ -182,28 +182,28 @@ ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm2[0],xmm1[0] ; AVX1-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm3[0],xmm4[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13] +; AVX1-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm5[0],xmm4[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,3,10,11,u,u,u,u,u,u,u,u,4,5,12,13] ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,1,3] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5],xmm7[6,7] -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,1,2,0] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; AVX1-NEXT: vinsertf128 $1, 
%xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] @@ -220,21 +220,21 @@ ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm2[0],xmm3[0] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4,5],ymm4[6],ymm6[7] -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-SLOW-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3,4,5],ymm2[6],ymm6[7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] @@ -252,23 +252,23 @@ ; AVX2-FAST-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 -; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm5 = 
xmm2[0],xmm3[0] -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] ; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,6,1,3,4,6,1,3] ; AVX2-FAST-ALL-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-ALL-NEXT: vpermd %ymm4, %ymm7, %ymm4 -; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4,5],ymm4[6],ymm6[7] -; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm7, %ymm2 +; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3,4,5],ymm2[6],ymm6[7] +; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-ALL-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -284,21 +284,21 @@ ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm2[0],xmm3[0] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4,5],ymm4[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] +; 
AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3,4,5],ymm2[6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -314,13 +314,13 @@ ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,u,u,u,u,u,u,u,u> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -13,15 +13,15 @@ ; SSE: # %bb.0: ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movaps %xmm0, (%rdx) +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: store_i32_stride2_vf2: ; AVX: # %bb.0: ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX-NEXT: vmovaps %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -29,7 +29,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512-NEXT: vmovaps %xmm0, (%rdx) ; AVX512-NEXT: retq %in.vec0 = load <2 x i32>, <2 x i32>* %in.vecptr0, align 32 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -11,27 +11,27 
@@ define void @store_i32_stride3_vf2(<2 x i32>* %in.vecptr0, <2 x i32>* %in.vecptr1, <2 x i32>* %in.vecptr2, <6 x i32>* %out.vec) nounwind { ; SSE-LABEL: store_i32_stride3_vf2: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1,3] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movq %xmm1, 16(%rcx) -; SSE-NEXT: movaps %xmm0, (%rcx) +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[1,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[0,2] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: movq %xmm3, 16(%rcx) +; SSE-NEXT: movaps %xmm2, (%rcx) ; SSE-NEXT: retq ; ; AVX1-LABEL: store_i32_stride3_vf2: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,0,3,7,5,4,7] ; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,u,1,u,5,u,u] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] @@ -45,9 +45,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,1,3,5,u,u> ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -60,9 +60,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,1,3,5,u,u> ; AVX512-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -28,10 +28,10 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = 
mem[0],zero ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,0,2,5,7,4,6] @@ -45,10 +45,10 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] @@ -60,10 +60,10 @@ ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-FAST-ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FAST-ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 @@ -75,10 +75,10 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] @@ -90,10 +90,10 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -16,20 +16,20 @@ ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm5 = mem[0],zero -; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm3[0] +; SSE-NEXT: movaps %xmm0, %xmm4 +; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0] +; SSE-NEXT: movsd {{.*#+}} xmm6 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm7 = mem[0],zero +; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm6[0] ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm4[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[1,3] -; SSE-NEXT: movaps %xmm6, 32(%rax) -; SSE-NEXT: movaps %xmm4, 16(%rax) +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm4[1,3] +; SSE-NEXT: movaps %xmm5, 32(%rax) +; SSE-NEXT: movaps %xmm7, 16(%rax) ; SSE-NEXT: movaps %xmm0, (%rax) ; SSE-NEXT: retq ; @@ -38,21 +38,21 @@ ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero ; AVX1-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero -; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm4 -; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[u,u,0,2,u,u,5,7] -; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5,6,7] -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[0,2,2,3] +; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX1-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero +; AVX1-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 +; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[u,u,0,2,u,u,5,7] +; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[0,2,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm3[1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,3],xmm4[1,3] ; AVX1-NEXT: vmovaps %xmm1, 32(%rax) ; AVX1-NEXT: vmovaps %ymm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -63,14 +63,14 @@ ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; 
AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[0,2,2,3] @@ -87,23 +87,23 @@ ; AVX2-FAST-ALL-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-FAST-ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-FAST-ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-ALL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm3[0] -; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-ALL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] +; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7] +; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm4 = <0,2,4,6,u,u,1,3> ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-ALL-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] +; AVX2-FAST-ALL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, (%rax) -; AVX2-FAST-ALL-NEXT: vmovaps %xmm1, 32(%rax) +; AVX2-FAST-ALL-NEXT: vmovaps %xmm3, 32(%rax) ; AVX2-FAST-ALL-NEXT: vzeroupper ; AVX2-FAST-ALL-NEXT: retq ; @@ -112,14 +112,14 @@ ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm2 = 
xmm2[0],xmm3[0] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[0,2,2,3] @@ -136,13 +136,13 @@ ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = <0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u> diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-2.ll @@ -77,15 +77,15 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, (%rdx) +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm1, (%rdx) ; SSE-NEXT: retq ; ; AVX-LABEL: store_i8_stride2_vf8: ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; @@ -93,7 +93,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512-NEXT: vmovdqa %xmm0, (%rdx) ; AVX512-NEXT: retq %in.vec0 = load <8 x i8>, <8 x i8>* %in.vecptr0, align 32 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -122,21 +122,21 @@ define void @store_i8_stride3_vf8(<8 x i8>* %in.vecptr0, <8 x i8>* 
%in.vecptr1, <8 x i8>* %in.vecptr2, <24 x i8>* %out.vec) nounwind { ; SSE-LABEL: store_i8_stride3_vf8: ; SSE: # %bb.0: +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: pand %xmm5, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: por %xmm4, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,5] @@ -148,8 +148,8 @@ ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,6] ; SSE-NEXT: pandn %xmm5, %xmm4 ; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] +; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255] @@ -165,14 +165,14 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] -; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm0[0],zero,zero,xmm0[1],zero,zero,xmm0[2],zero,zero,xmm0[3],zero,zero,xmm0[4],zero ; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = 
zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[13],zero,xmm1[6,14],zero,xmm1[7,15],zero,xmm1[u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[5],zero,zero,xmm0[6],zero,zero,xmm0[7,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, 16(%rcx) ; AVX1-NEXT: vmovdqa %xmm2, (%rcx) ; AVX1-NEXT: retq @@ -181,9 +181,9 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -198,9 +198,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,8],zero,ymm0[1,9],zero,ymm0[2,10],zero,ymm0[3,11],zero,ymm0[4,12],zero,ymm0[5],zero,ymm0[21],zero,zero,ymm0[22],zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[0],zero,zero,ymm0[1],zero,zero,ymm0[2],zero,zero,ymm0[3],zero,zero,ymm0[4],zero,ymm0[29],zero,ymm0[22,30],zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -351,13 +351,13 @@ ; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX1-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u],zero,zero,xmm1[3,11,u,u],zero,zero,xmm1[4,12,u,u],zero,zero ; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,3,11],zero,zero,xmm0[u,u,4,12],zero,zero,xmm0[u,u,5,13] ; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 @@ -384,29 +384,29 
@@ ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14],ymm2[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] ; AVX2-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-SLOW-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -415,29 +415,29 @@ ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = 
xmm2[0],xmm1[0] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,8,u,u,u,u,1,9,u,u,u,u,2,10,u,u,19,27,u,u,u,u,20,28,u,u,u,u,21,29] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14],ymm2[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u] ; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa %xmm0, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -446,13 +446,13 @@ ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ 
b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -2460,18 +2460,18 @@ ; ; AVX1-LABEL: shuffle_mem_pmovzx_v4f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovaps %xmm1, (%rsi) ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: shuffle_mem_pmovzx_v4f32: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2OR512VL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2OR512VL-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %xmm0 ; AVX2OR512VL-NEXT: vmovaps %xmm1, (%rsi) ; AVX2OR512VL-NEXT: retq