diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1239,7 +1239,15 @@ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. @@ -1480,9 +1488,11 @@ setOperationAction(ISD::ANY_EXTEND, VT, Custom); } - setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i16, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom); + setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { @@ -1802,7 +1812,6 @@ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom); - setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); @@ -2338,10 +2347,6 @@ setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom); } - - setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); } if (Subtarget.hasAMXTILE()) { @@ -22869,6 +22874,84 @@ return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget); } +/// This function lowers a vector truncation of 'extended sign-bits' or +/// 'extended zero-bits' values. +/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations. +static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, + const SDLoc &DL, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + MVT SrcVT = In.getSimpleValueType(); + MVT DstSVT = DstVT.getVectorElementType(); + MVT SrcSVT = SrcVT.getVectorElementType(); + if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) && + (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32))) + return SDValue(); + + unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits(); + unsigned NumPackedSignBits = std::min(DstSVT.getSizeInBits(), 16); + unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; + + // Truncate with PACKUS if we are truncating a vector with leading zero + // bits that extend all the way to the packed/truncated value. Pre-SSE41 + // we can only use PACKUSWB. 
+ KnownBits Known = DAG.computeKnownBits(In); + if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) + if (SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, + Subtarget)) + return V; + + // Truncate with PACKSS if we are truncating a vector with sign-bits + // that extend all the way to the packed/truncated value. + if ((NumSrcEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In)) + if (SDValue V = truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, + Subtarget)) + return V; + + return SDValue(); +} + +/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into +/// X86ISD::PACKUS/X86ISD::PACKSS operations. +static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + MVT SrcVT = In.getSimpleValueType(); + MVT DstSVT = DstVT.getVectorElementType(); + MVT SrcSVT = SrcVT.getVectorElementType(); + unsigned NumElems = DstVT.getVectorNumElements(); + if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) && + (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) && + NumElems >= 8)) + return SDValue(); + + // SSSE3's pshufb results in less instructions in the cases below. + if (Subtarget.hasSSSE3() && NumElems == 8) { + if (SrcSVT == MVT::i16) + return SDValue(); + if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41())) + return SDValue(); + } + + // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS + // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to + // truncate 2 x v4i32 to v8i16. + if (Subtarget.hasSSE41() || DstSVT == MVT::i8) + return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG); + + if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32) + return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG); + + // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS. + if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) { + MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In); + return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG); + } + + return SDValue(); +} + static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -22955,8 +23038,6 @@ MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); - unsigned InNumEltBits = InVT.getScalarSizeInBits(); - assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); @@ -22964,7 +23045,7 @@ const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(InVT)) { if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) && - VT.is128BitVector()) { + VT.is128BitVector() && Subtarget.hasAVX512()) { assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) && "Unexpected subtarget!"); // The default behavior is to truncate one step, concatenate, and then @@ -22981,6 +23062,15 @@ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } + // Pre-AVX512 see if we can make use of PACKSS/PACKUS. + if (!Subtarget.hasAVX512()) { + if (SDValue SignPack = + LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG)) + return SignPack; + + return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG); + } + // Otherwise let default legalization handle it. 
return SDValue(); } @@ -22988,28 +23078,12 @@ if (VT.getVectorElementType() == MVT::i1) return LowerTruncateVecI1(Op, DAG, Subtarget); - unsigned NumPackedSignBits = std::min(VT.getScalarSizeInBits(), 16); - unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; - // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to // concat from subvectors to use VPTRUNC etc. - if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG)) { - // Truncate with PACKUS if we are truncating a vector with leading zero - // bits that extend all the way to the packed/truncated value. Pre-SSE41 - // we can only use PACKUSWB. - KnownBits Known = DAG.computeKnownBits(In); - if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) - if (SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, - Subtarget)) - return V; - - // Truncate with PACKSS if we are truncating a vector with sign-bits - // that extend all the way to the packed/truncated value. - if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In)) - if (SDValue V = truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, - Subtarget)) - return V; - } + if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG)) + if (SDValue SignPack = + LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG)) + return SignPack; // vpmovqb/w/d, vpmovdb/w, vpmovwb if (Subtarget.hasAVX512()) { @@ -23068,27 +23142,9 @@ return DAG.getBitcast(MVT::v8i16, In); } - SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, - DAG.getIntPtrConstant(0, DL)); - SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, - DAG.getIntPtrConstant(4, DL)); - - // The PSHUFB mask: - static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1}; - - OpLo = DAG.getBitcast(MVT::v8i16, OpLo); - OpHi = DAG.getBitcast(MVT::v8i16, OpHi); - - OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1); - OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1); - - OpLo = DAG.getBitcast(MVT::v4i32, OpLo); - OpHi = DAG.getBitcast(MVT::v4i32, OpHi); - - // The MOVLHPS Mask: - static const int ShufMask2[] = {0, 1, 4, 5}; - SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); - return DAG.getBitcast(MVT::v8i16, res); + return Subtarget.hasSSE41() + ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG) + : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG); } if (VT == MVT::v16i8 && InVT == MVT::v16i16) @@ -53152,6 +53208,7 @@ /// legalization the truncation will be translated into a BUILD_VECTOR with each /// element that is extracted from a vector and then truncated, and it is /// difficult to do this optimization based on them. +/// TODO: Remove this and just use LowerTruncateVecPack. static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT OutVT = N->getValueType(0); @@ -53200,6 +53257,7 @@ /// This function transforms vector truncation of 'extended sign-bits' or /// 'extended zero-bits' values. /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations. +/// TODO: Remove this and just use LowerTruncateVecPackWithSignBits. 
static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll @@ -451,13 +451,13 @@ define i8 @v8i32_or_vselect(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { ; SSE2-SSSE3-LABEL: v8i32_or_vselect: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-SSSE3-NEXT: por %xmm5, %xmm1 +; SSE2-SSSE3-NEXT: por %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 -; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm4 -; SSE2-SSSE3-NEXT: por %xmm0, %xmm4 -; SSE2-SSSE3-NEXT: packsswb %xmm4, %xmm4 -; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %eax +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; @@ -514,10 +514,8 @@ define i8 @v8i32_or_select(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3, i1 %a4) { ; SSE2-SSSE3-LABEL: v8i32_or_select: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2 -; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm6 +; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3 ; SSE2-SSSE3-NEXT: testb $1, %dil ; SSE2-SSSE3-NEXT: jne .LBB7_1 ; SSE2-SSSE3-NEXT: # %bb.2: @@ -528,7 +526,9 @@ ; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: .LBB7_3: +; SSE2-SSSE3-NEXT: por %xmm3, %xmm7 ; SSE2-SSSE3-NEXT: por %xmm2, %xmm6 +; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm6 ; SSE2-SSSE3-NEXT: por %xmm0, %xmm6 ; SSE2-SSSE3-NEXT: packsswb %xmm6, %xmm6 ; SSE2-SSSE3-NEXT: pmovmskb %xmm6, %eax diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll @@ -356,21 +356,18 @@ ; ; AVX2-LABEL: v16i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtd %ymm7, %ymm5, %ymm5 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpcmpgtd %ymm7, %ymm5, %ymm2 +; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm3 +; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -450,21 +447,18 @@ ; ; AVX2-LABEL: v16f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vcmpltps %ymm1, %ymm3, 
%ymm1 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vcmpltps %ymm4, %ymm6, %ymm4 -; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm6 -; AVX2-NEXT: vandps %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vcmpltps %ymm5, %ymm7, %ymm5 -; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vandps %xmm6, %xmm3, %xmm3 -; AVX2-NEXT: vandps %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vandps %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vcmpltps %ymm5, %ymm7, %ymm2 +; AVX2-NEXT: vcmpltps %ymm4, %ymm6, %ymm3 +; AVX2-NEXT: vpackssdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpmovmskb %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -720,20 +720,16 @@ define i1 @trunc_v8i64_cmp(<8 x i64> %a0) nounwind { ; SSE2-SSSE3-LABEL: trunc_v8i64_cmp: ; SSE2-SSSE3: # %bb.0: -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE2-SSSE3-NEXT: psllw $15, %xmm2 -; SSE2-SSSE3-NEXT: packsswb %xmm2, %xmm2 -; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE2-SSSE3-NEXT: pslld $16, %xmm2 +; SSE2-SSSE3-NEXT: psrad $16, %xmm2 +; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-SSSE3-NEXT: pslld $16, %xmm0 +; SSE2-SSSE3-NEXT: psrad $16, %xmm0 +; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0 +; SSE2-SSSE3-NEXT: psllw $15, %xmm0 +; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0 +; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax ; SSE2-SSSE3-NEXT: cmpb $-1, %al ; SSE2-SSSE3-NEXT: sete %al ; SSE2-SSSE3-NEXT: retq @@ -1471,16 +1467,12 @@ ; SSE-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE-NEXT: pcmpeqd %xmm3, %xmm1 ; SSE-NEXT: movdqu (%rdi), %xmm2 -; SSE-NEXT: movdqu 16(%rdi), %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: packssdw %xmm4, %xmm5 -; SSE-NEXT: pmovmskb %xmm5, %eax -; SSE-NEXT: testl %eax, %eax +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqu 16(%rdi), %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: packssdw %xmm0, %xmm2 +; SSE-NEXT: pmovmskb %xmm2, %eax +; SSE-NEXT: testl $43690, %eax # imm = 0xAAAA ; SSE-NEXT: setne %al ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll --- 
a/llvm/test/CodeGen/X86/cast-vsel.ll +++ b/llvm/test/CodeGen/X86/cast-vsel.ll @@ -318,10 +318,10 @@ ; SSE41-NEXT: andps %xmm2, %xmm4 ; SSE41-NEXT: cmpltps dd+4112(%rax), %xmm3 ; SSE41-NEXT: andps %xmm1, %xmm3 -; SSE41-NEXT: andps %xmm0, %xmm4 ; SSE41-NEXT: andps %xmm0, %xmm3 +; SSE41-NEXT: psrld $31, %xmm4 ; SSE41-NEXT: movaps %xmm3, dj+4112(%rax) -; SSE41-NEXT: movaps %xmm4, dj+4096(%rax) +; SSE41-NEXT: movdqa %xmm4, dj+4096(%rax) ; SSE41-NEXT: addq $32, %rax ; SSE41-NEXT: jne .LBB5_1 ; SSE41-NEXT: # %bb.2: # %for.end diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -5608,118 +5608,98 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigger.ptr, ptr %val.ptr, ptr %dst) { ; SSE2-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts: ; SSE2: ## %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm6 +; SSE2-NEXT: movdqa 32(%rdi), %xmm7 +; SSE2-NEXT: movdqa 64(%rdi), %xmm8 ; SSE2-NEXT: movl 80(%rsi), %eax -; SSE2-NEXT: movl 64(%rsi), %ecx -; SSE2-NEXT: movl 48(%rsi), %r8d -; SSE2-NEXT: movl 32(%rsi), %r9d -; SSE2-NEXT: movl 16(%rsi), %r10d +; SSE2-NEXT: movl 64(%rsi), %r8d +; SSE2-NEXT: movl 48(%rsi), %r9d +; SSE2-NEXT: movl 32(%rsi), %r10d +; SSE2-NEXT: movl 16(%rsi), %r11d ; SSE2-NEXT: movdqa 80(%rsi), %xmm0 ; SSE2-NEXT: movdqa 64(%rsi), %xmm1 ; SSE2-NEXT: movdqa 48(%rsi), %xmm2 ; SSE2-NEXT: movdqa 32(%rsi), %xmm3 ; SSE2-NEXT: movdqa 16(%rsi), %xmm4 ; SSE2-NEXT: movdqa (%rsi), %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,2,2,3] -; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = mem[0,2,2,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,2,2,3] -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm9 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = mem[0,2,2,3] -; SSE2-NEXT: pxor %xmm10, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2-NEXT: pshuflw {{.*#+}} xmm9 = xmm10[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; SSE2-NEXT: movsd {{.*#+}} xmm9 = xmm8[0],xmm9[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = mem[0,2,2,3] -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 -; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = mem[0,2,2,3] -; SSE2-NEXT: pcmpgtd %xmm8, %xmm7 -; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE2-NEXT: pmovmskb %xmm9, %r11d -; SSE2-NEXT: andl $21845, %r11d ## imm = 0x5555 -; SSE2-NEXT: pmovmskb %xmm7, %edi -; SSE2-NEXT: andl $85, %edi -; SSE2-NEXT: shll $16, %edi -; SSE2-NEXT: orl %r11d, %edi -; SSE2-NEXT: testb $1, %dil +; SSE2-NEXT: packssdw 48(%rdi), %xmm7 +; SSE2-NEXT: packssdw 16(%rdi), %xmm6 +; SSE2-NEXT: packsswb %xmm7, %xmm6 +; SSE2-NEXT: packssdw 80(%rdi), %xmm8 +; SSE2-NEXT: packsswb %xmm8, %xmm8 +; SSE2-NEXT: pmovmskb %xmm6, %edi +; SSE2-NEXT: andl $21845, %edi ## imm = 0x5555 +; SSE2-NEXT: pmovmskb %xmm8, %ecx +; SSE2-NEXT: andl $85, %ecx +; SSE2-NEXT: shll $16, %ecx +; SSE2-NEXT: orl %edi, %ecx +; SSE2-NEXT: testb $1, %cl 
; SSE2-NEXT: jne LBB31_1 ; SSE2-NEXT: ## %bb.2: ## %else -; SSE2-NEXT: testb $2, %dil +; SSE2-NEXT: testb $2, %cl ; SSE2-NEXT: jne LBB31_3 ; SSE2-NEXT: LBB31_4: ## %else2 -; SSE2-NEXT: testb $4, %dil +; SSE2-NEXT: testb $4, %cl ; SSE2-NEXT: jne LBB31_5 ; SSE2-NEXT: LBB31_6: ## %else4 -; SSE2-NEXT: testb $8, %dil +; SSE2-NEXT: testb $8, %cl ; SSE2-NEXT: jne LBB31_7 ; SSE2-NEXT: LBB31_8: ## %else6 -; SSE2-NEXT: testb $16, %dil +; SSE2-NEXT: testb $16, %cl ; SSE2-NEXT: jne LBB31_9 ; SSE2-NEXT: LBB31_10: ## %else8 -; SSE2-NEXT: testb $32, %dil +; SSE2-NEXT: testb $32, %cl ; SSE2-NEXT: jne LBB31_11 ; SSE2-NEXT: LBB31_12: ## %else10 -; SSE2-NEXT: testb $64, %dil +; SSE2-NEXT: testb $64, %cl ; SSE2-NEXT: jne LBB31_13 ; SSE2-NEXT: LBB31_14: ## %else12 -; SSE2-NEXT: testb %dil, %dil +; SSE2-NEXT: testb %cl, %cl ; SSE2-NEXT: js LBB31_15 ; SSE2-NEXT: LBB31_16: ## %else14 -; SSE2-NEXT: testl $256, %edi ## imm = 0x100 +; SSE2-NEXT: testl $256, %ecx ## imm = 0x100 ; SSE2-NEXT: jne LBB31_17 ; SSE2-NEXT: LBB31_18: ## %else16 -; SSE2-NEXT: testl $512, %edi ## imm = 0x200 +; SSE2-NEXT: testl $512, %ecx ## imm = 0x200 ; SSE2-NEXT: jne LBB31_19 ; SSE2-NEXT: LBB31_20: ## %else18 -; SSE2-NEXT: testl $1024, %edi ## imm = 0x400 +; SSE2-NEXT: testl $1024, %ecx ## imm = 0x400 ; SSE2-NEXT: jne LBB31_21 ; SSE2-NEXT: LBB31_22: ## %else20 -; SSE2-NEXT: testl $2048, %edi ## imm = 0x800 +; SSE2-NEXT: testl $2048, %ecx ## imm = 0x800 ; SSE2-NEXT: jne LBB31_23 ; SSE2-NEXT: LBB31_24: ## %else22 -; SSE2-NEXT: testl $4096, %edi ## imm = 0x1000 +; SSE2-NEXT: testl $4096, %ecx ## imm = 0x1000 ; SSE2-NEXT: jne LBB31_25 ; SSE2-NEXT: LBB31_26: ## %else24 -; SSE2-NEXT: testl $8192, %edi ## imm = 0x2000 +; SSE2-NEXT: testl $8192, %ecx ## imm = 0x2000 ; SSE2-NEXT: jne LBB31_27 ; SSE2-NEXT: LBB31_28: ## %else26 -; SSE2-NEXT: testl $16384, %edi ## imm = 0x4000 +; SSE2-NEXT: testl $16384, %ecx ## imm = 0x4000 ; SSE2-NEXT: jne LBB31_29 ; SSE2-NEXT: LBB31_30: ## %else28 -; SSE2-NEXT: testw %di, %di +; SSE2-NEXT: testw %cx, %cx ; SSE2-NEXT: js LBB31_31 ; SSE2-NEXT: LBB31_32: ## %else30 -; SSE2-NEXT: testl $65536, %edi ## imm = 0x10000 +; SSE2-NEXT: testl $65536, %ecx ## imm = 0x10000 ; SSE2-NEXT: jne LBB31_33 ; SSE2-NEXT: LBB31_34: ## %else32 -; SSE2-NEXT: testl $131072, %edi ## imm = 0x20000 +; SSE2-NEXT: testl $131072, %ecx ## imm = 0x20000 ; SSE2-NEXT: jne LBB31_35 ; SSE2-NEXT: LBB31_36: ## %else34 -; SSE2-NEXT: testl $262144, %edi ## imm = 0x40000 +; SSE2-NEXT: testl $262144, %ecx ## imm = 0x40000 ; SSE2-NEXT: jne LBB31_37 ; SSE2-NEXT: LBB31_38: ## %else36 -; SSE2-NEXT: testl $524288, %edi ## imm = 0x80000 +; SSE2-NEXT: testl $524288, %ecx ## imm = 0x80000 ; SSE2-NEXT: jne LBB31_39 ; SSE2-NEXT: LBB31_40: ## %else38 -; SSE2-NEXT: testl $1048576, %edi ## imm = 0x100000 +; SSE2-NEXT: testl $1048576, %ecx ## imm = 0x100000 ; SSE2-NEXT: jne LBB31_41 ; SSE2-NEXT: LBB31_42: ## %else40 -; SSE2-NEXT: testl $2097152, %edi ## imm = 0x200000 +; SSE2-NEXT: testl $2097152, %ecx ## imm = 0x200000 ; SSE2-NEXT: jne LBB31_43 ; SSE2-NEXT: LBB31_44: ## %else42 -; SSE2-NEXT: testl $4194304, %edi ## imm = 0x400000 +; SSE2-NEXT: testl $4194304, %ecx ## imm = 0x400000 ; SSE2-NEXT: je LBB31_46 ; SSE2-NEXT: LBB31_45: ## %cond.store43 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -5738,123 +5718,123 @@ ; SSE2-NEXT: LBB31_1: ## %cond.store ; SSE2-NEXT: movl (%rsi), %esi ; SSE2-NEXT: movl %esi, (%rdx) -; SSE2-NEXT: testb $2, %dil +; SSE2-NEXT: testb $2, %cl ; SSE2-NEXT: je LBB31_4 ; SSE2-NEXT: LBB31_3: ## %cond.store1 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = 
xmm5[1,1,1,1] ; SSE2-NEXT: movd %xmm6, %esi ; SSE2-NEXT: movl %esi, 4(%rdx) -; SSE2-NEXT: testb $4, %dil +; SSE2-NEXT: testb $4, %cl ; SSE2-NEXT: je LBB31_6 ; SSE2-NEXT: LBB31_5: ## %cond.store3 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] ; SSE2-NEXT: movd %xmm6, %esi ; SSE2-NEXT: movl %esi, 8(%rdx) -; SSE2-NEXT: testb $8, %dil +; SSE2-NEXT: testb $8, %cl ; SSE2-NEXT: je LBB31_8 ; SSE2-NEXT: LBB31_7: ## %cond.store5 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3] ; SSE2-NEXT: movd %xmm5, %esi ; SSE2-NEXT: movl %esi, 12(%rdx) -; SSE2-NEXT: testb $16, %dil +; SSE2-NEXT: testb $16, %cl ; SSE2-NEXT: je LBB31_10 ; SSE2-NEXT: LBB31_9: ## %cond.store7 -; SSE2-NEXT: movl %r10d, 16(%rdx) -; SSE2-NEXT: testb $32, %dil +; SSE2-NEXT: movl %r11d, 16(%rdx) +; SSE2-NEXT: testb $32, %cl ; SSE2-NEXT: je LBB31_12 ; SSE2-NEXT: LBB31_11: ## %cond.store9 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,1,1] ; SSE2-NEXT: movd %xmm5, %esi ; SSE2-NEXT: movl %esi, 20(%rdx) -; SSE2-NEXT: testb $64, %dil +; SSE2-NEXT: testb $64, %cl ; SSE2-NEXT: je LBB31_14 ; SSE2-NEXT: LBB31_13: ## %cond.store11 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3] ; SSE2-NEXT: movd %xmm5, %esi ; SSE2-NEXT: movl %esi, 24(%rdx) -; SSE2-NEXT: testb %dil, %dil +; SSE2-NEXT: testb %cl, %cl ; SSE2-NEXT: jns LBB31_16 ; SSE2-NEXT: LBB31_15: ## %cond.store13 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] ; SSE2-NEXT: movd %xmm4, %esi ; SSE2-NEXT: movl %esi, 28(%rdx) -; SSE2-NEXT: testl $256, %edi ## imm = 0x100 +; SSE2-NEXT: testl $256, %ecx ## imm = 0x100 ; SSE2-NEXT: je LBB31_18 ; SSE2-NEXT: LBB31_17: ## %cond.store15 -; SSE2-NEXT: movl %r9d, 32(%rdx) -; SSE2-NEXT: testl $512, %edi ## imm = 0x200 +; SSE2-NEXT: movl %r10d, 32(%rdx) +; SSE2-NEXT: testl $512, %ecx ## imm = 0x200 ; SSE2-NEXT: je LBB31_20 ; SSE2-NEXT: LBB31_19: ## %cond.store17 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] ; SSE2-NEXT: movd %xmm4, %esi ; SSE2-NEXT: movl %esi, 36(%rdx) -; SSE2-NEXT: testl $1024, %edi ## imm = 0x400 +; SSE2-NEXT: testl $1024, %ecx ## imm = 0x400 ; SSE2-NEXT: je LBB31_22 ; SSE2-NEXT: LBB31_21: ## %cond.store19 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] ; SSE2-NEXT: movd %xmm4, %esi ; SSE2-NEXT: movl %esi, 40(%rdx) -; SSE2-NEXT: testl $2048, %edi ## imm = 0x800 +; SSE2-NEXT: testl $2048, %ecx ## imm = 0x800 ; SSE2-NEXT: je LBB31_24 ; SSE2-NEXT: LBB31_23: ## %cond.store21 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSE2-NEXT: movd %xmm3, %esi ; SSE2-NEXT: movl %esi, 44(%rdx) -; SSE2-NEXT: testl $4096, %edi ## imm = 0x1000 +; SSE2-NEXT: testl $4096, %ecx ## imm = 0x1000 ; SSE2-NEXT: je LBB31_26 ; SSE2-NEXT: LBB31_25: ## %cond.store23 -; SSE2-NEXT: movl %r8d, 48(%rdx) -; SSE2-NEXT: testl $8192, %edi ## imm = 0x2000 +; SSE2-NEXT: movl %r9d, 48(%rdx) +; SSE2-NEXT: testl $8192, %ecx ## imm = 0x2000 ; SSE2-NEXT: je LBB31_28 ; SSE2-NEXT: LBB31_27: ## %cond.store25 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] ; SSE2-NEXT: movd %xmm3, %esi ; SSE2-NEXT: movl %esi, 52(%rdx) -; SSE2-NEXT: testl $16384, %edi ## imm = 0x4000 +; SSE2-NEXT: testl $16384, %ecx ## imm = 0x4000 ; SSE2-NEXT: je LBB31_30 ; SSE2-NEXT: LBB31_29: ## %cond.store27 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE2-NEXT: movd %xmm3, %esi ; SSE2-NEXT: movl %esi, 56(%rdx) -; SSE2-NEXT: testw %di, %di +; SSE2-NEXT: testw %cx, %cx ; SSE2-NEXT: jns LBB31_32 ; SSE2-NEXT: LBB31_31: ## %cond.store29 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE2-NEXT: movd %xmm2, %esi ; SSE2-NEXT: movl %esi, 60(%rdx) -; SSE2-NEXT: testl $65536, %edi ## imm = 0x10000 +; 
SSE2-NEXT: testl $65536, %ecx ## imm = 0x10000 ; SSE2-NEXT: je LBB31_34 ; SSE2-NEXT: LBB31_33: ## %cond.store31 -; SSE2-NEXT: movl %ecx, 64(%rdx) -; SSE2-NEXT: testl $131072, %edi ## imm = 0x20000 +; SSE2-NEXT: movl %r8d, 64(%rdx) +; SSE2-NEXT: testl $131072, %ecx ## imm = 0x20000 ; SSE2-NEXT: je LBB31_36 ; SSE2-NEXT: LBB31_35: ## %cond.store33 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: movl %ecx, 68(%rdx) -; SSE2-NEXT: testl $262144, %edi ## imm = 0x40000 +; SSE2-NEXT: movd %xmm2, %esi +; SSE2-NEXT: movl %esi, 68(%rdx) +; SSE2-NEXT: testl $262144, %ecx ## imm = 0x40000 ; SSE2-NEXT: je LBB31_38 ; SSE2-NEXT: LBB31_37: ## %cond.store35 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; SSE2-NEXT: movd %xmm2, %ecx -; SSE2-NEXT: movl %ecx, 72(%rdx) -; SSE2-NEXT: testl $524288, %edi ## imm = 0x80000 +; SSE2-NEXT: movd %xmm2, %esi +; SSE2-NEXT: movl %esi, 72(%rdx) +; SSE2-NEXT: testl $524288, %ecx ## imm = 0x80000 ; SSE2-NEXT: je LBB31_40 ; SSE2-NEXT: LBB31_39: ## %cond.store37 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: movl %ecx, 76(%rdx) -; SSE2-NEXT: testl $1048576, %edi ## imm = 0x100000 +; SSE2-NEXT: movd %xmm1, %esi +; SSE2-NEXT: movl %esi, 76(%rdx) +; SSE2-NEXT: testl $1048576, %ecx ## imm = 0x100000 ; SSE2-NEXT: je LBB31_42 ; SSE2-NEXT: LBB31_41: ## %cond.store39 ; SSE2-NEXT: movl %eax, 80(%rdx) -; SSE2-NEXT: testl $2097152, %edi ## imm = 0x200000 +; SSE2-NEXT: testl $2097152, %ecx ## imm = 0x200000 ; SSE2-NEXT: je LBB31_44 ; SSE2-NEXT: LBB31_43: ## %cond.store41 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: movl %eax, 84(%rdx) -; SSE2-NEXT: testl $4194304, %edi ## imm = 0x400000 +; SSE2-NEXT: testl $4194304, %ecx ## imm = 0x400000 ; SSE2-NEXT: jne LBB31_45 ; SSE2-NEXT: jmp LBB31_46 ; @@ -5878,6 +5858,9 @@ ; SSE4-NEXT: .cfi_offset %r14, -32 ; SSE4-NEXT: .cfi_offset %r15, -24 ; SSE4-NEXT: .cfi_offset %rbp, -16 +; SSE4-NEXT: movdqa (%rdi), %xmm1 +; SSE4-NEXT: movdqa 32(%rdi), %xmm2 +; SSE4-NEXT: movdqa 64(%rdi), %xmm0 ; SSE4-NEXT: movl 92(%rsi), %eax ; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE4-NEXT: movl 88(%rsi), %eax @@ -5900,33 +5883,14 @@ ; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; SSE4-NEXT: movl 52(%rsi), %eax ; SSE4-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; SSE4-NEXT: pxor %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpgtd 48(%rdi), %xmm1 -; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpgtd 32(%rdi), %xmm2 -; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] -; SSE4-NEXT: packusdw %xmm1, %xmm2 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpgtd 16(%rdi), %xmm1 -; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] -; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pcmpgtd (%rdi), %xmm3 -; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7] -; SSE4-NEXT: packusdw %xmm1, %xmm3 -; SSE4-NEXT: packusdw %xmm2, %xmm3 -; SSE4-NEXT: pxor %xmm1, %xmm1 -; SSE4-NEXT: pcmpgtd 80(%rdi), %xmm1 -; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7] -; SSE4-NEXT: pxor %xmm2, %xmm2 -; SSE4-NEXT: pcmpgtd 64(%rdi), %xmm2 -; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7] -; SSE4-NEXT: packusdw %xmm1, %xmm2 -; SSE4-NEXT: packusdw %xmm2, %xmm2 -; SSE4-NEXT: pmovmskb %xmm3, 
%eax +; SSE4-NEXT: packssdw 48(%rdi), %xmm2 +; SSE4-NEXT: packssdw 16(%rdi), %xmm1 +; SSE4-NEXT: packsswb %xmm2, %xmm1 +; SSE4-NEXT: packssdw 80(%rdi), %xmm0 +; SSE4-NEXT: packsswb %xmm0, %xmm0 +; SSE4-NEXT: pmovmskb %xmm1, %eax ; SSE4-NEXT: andl $21845, %eax ## imm = 0x5555 -; SSE4-NEXT: pmovmskb %xmm2, %edi +; SSE4-NEXT: pmovmskb %xmm0, %edi ; SSE4-NEXT: andl $85, %edi ; SSE4-NEXT: shll $16, %edi ; SSE4-NEXT: orl %eax, %edi @@ -6171,19 +6135,23 @@ ; AVX2-NEXT: vmovdqa 64(%rsi), %ymm2 ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpgtd 32(%rdi), %ymm3, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-NEXT: vpackssdw %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpgtd 64(%rdi), %ymm3, %ymm5 -; AVX2-NEXT: vpcmpgtd (%rdi), %ymm3, %ymm6 -; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] -; AVX2-NEXT: vpslld $31, %ymm6, %ymm6 -; AVX2-NEXT: vpmaskmovd %ymm0, %ymm6, (%rdx) -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7] +; AVX2-NEXT: vpcmpgtd (%rdi), %ymm3, %ymm5 +; AVX2-NEXT: vpackssdw %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = mem[0,2,2,3,4,6,6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-NEXT: vpcmpgtd %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpacksswb %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,1,3] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX2-NEXT: vpslld $31, %ymm3, %ymm3 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm3, (%rdx) +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, 64(%rdx) -; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, 32(%rdx) diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -215,17 +215,13 @@ ; SSE2-LABEL: truncstore_v8i64_v8i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1] +; SSE2-NEXT: shufps {{.*#+}} 
xmm2 = xmm2[0,2],xmm3[0,2] +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm2, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm1, %xmm5 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -313,63 +313,59 @@ ; SSE2-LABEL: truncstore_v8i64_v8i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm10, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm8 -; SSE2-NEXT: pand %xmm11, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] +; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183,2147549183,2147549183] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm10 +; SSE2-NEXT: pand %xmm11, %xmm10 ; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: pxor %xmm7, %xmm8 -; SSE2-NEXT: por %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,0,2,2] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pand %xmm12, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2] -; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: pand %xmm10, %xmm1 +; SSE2-NEXT: pxor %xmm7, %xmm10 +; SSE2-NEXT: por %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pxor %xmm8, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2] +; SSE2-NEXT: movdqa %xmm9, %xmm12 ; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pand %xmm12, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpgtd %xmm11, %xmm10 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 +; SSE2-NEXT: pand %xmm12, %xmm1 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm1 +; SSE2-NEXT: por %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[0,2] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm11 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pand %xmm10, %xmm0 -; 
SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 +; SSE2-NEXT: pand %xmm11, %xmm0 +; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pxor %xmm7, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pxor %xmm8, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] +; SSE2-NEXT: pcmpgtd %xmm10, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2-NEXT: pcmpeqd %xmm8, %xmm3 +; SSE2-NEXT: pand %xmm9, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pxor %xmm7, %xmm3 +; SSE2-NEXT: por %xmm2, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2] +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: packssdw %xmm3, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pxor %xmm7, %xmm5 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm4 @@ -403,42 +399,42 @@ ; SSE2-NEXT: .LBB1_16: # %else14 ; SSE2-NEXT: retq ; SSE2-NEXT: .LBB1_1: # %cond.store -; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: movd %xmm1, %ecx ; SSE2-NEXT: movw %cx, (%rdi) ; SSE2-NEXT: testb $2, %al ; SSE2-NEXT: je .LBB1_4 ; SSE2-NEXT: .LBB1_3: # %cond.store1 -; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: pextrw $1, %xmm1, %ecx ; SSE2-NEXT: movw %cx, 2(%rdi) ; SSE2-NEXT: testb $4, %al ; SSE2-NEXT: je .LBB1_6 ; SSE2-NEXT: .LBB1_5: # %cond.store3 -; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: pextrw $2, %xmm1, %ecx ; SSE2-NEXT: movw %cx, 4(%rdi) ; SSE2-NEXT: testb $8, %al ; SSE2-NEXT: je .LBB1_8 ; SSE2-NEXT: .LBB1_7: # %cond.store5 -; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: pextrw $3, %xmm1, %ecx ; SSE2-NEXT: movw %cx, 6(%rdi) ; SSE2-NEXT: testb $16, %al ; SSE2-NEXT: je .LBB1_10 ; SSE2-NEXT: .LBB1_9: # %cond.store7 -; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: pextrw $4, %xmm1, %ecx ; SSE2-NEXT: movw %cx, 8(%rdi) ; SSE2-NEXT: testb $32, %al ; SSE2-NEXT: je .LBB1_12 ; SSE2-NEXT: .LBB1_11: # %cond.store9 -; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: pextrw $5, %xmm1, %ecx ; SSE2-NEXT: movw %cx, 10(%rdi) ; SSE2-NEXT: testb $64, %al ; SSE2-NEXT: je .LBB1_14 ; SSE2-NEXT: .LBB1_13: # %cond.store11 -; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: pextrw $6, %xmm1, %ecx ; SSE2-NEXT: movw %cx, 12(%rdi) ; SSE2-NEXT: testb $-128, %al ; SSE2-NEXT: je .LBB1_16 ; SSE2-NEXT: .LBB1_15: # %cond.store13 -; SSE2-NEXT: pextrw $7, %xmm0, %eax +; SSE2-NEXT: pextrw $7, %xmm1, %eax ; SSE2-NEXT: movw %ax, 14(%rdi) ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -970,14 +970,13 @@ ; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4 ; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5 ; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4 ; AVX2-NEXT: vpmaxud %ymm3, 
%ymm2, %ymm3 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm3 ; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpacksswb %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vpackssdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpacksswb %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 @@ -1629,7 +1628,7 @@ ; SSE2OR3-LABEL: psubus_8i64_max: ; SSE2OR3: # %bb.0: # %vector.ph ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE2OR3-NEXT: movdqa %xmm2, %xmm7 +; SSE2OR3-NEXT: movdqa %xmm4, %xmm7 ; SSE2OR3-NEXT: pxor %xmm5, %xmm7 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] ; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183] @@ -1639,53 +1638,49 @@ ; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8 ; SSE2OR3-NEXT: pand %xmm9, %xmm8 ; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE2OR3-NEXT: pand %xmm8, %xmm2 +; SSE2OR3-NEXT: pand %xmm8, %xmm4 ; SSE2OR3-NEXT: pxor %xmm7, %xmm8 -; SSE2OR3-NEXT: por %xmm2, %xmm8 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] -; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; SSE2OR3-NEXT: movdqa %xmm1, %xmm8 -; SSE2OR3-NEXT: pxor %xmm5, %xmm8 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] +; SSE2OR3-NEXT: por %xmm4, %xmm8 +; SSE2OR3-NEXT: movdqa %xmm3, %xmm4 +; SSE2OR3-NEXT: pxor %xmm5, %xmm4 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[0,0,2,2] ; SSE2OR3-NEXT: movdqa %xmm6, %xmm10 ; SSE2OR3-NEXT: pcmpgtd %xmm9, %xmm10 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8 -; SSE2OR3-NEXT: pand %xmm10, %xmm8 -; SSE2OR3-NEXT: pand %xmm8, %xmm1 -; SSE2OR3-NEXT: pxor %xmm7, %xmm8 -; SSE2OR3-NEXT: por %xmm1, %xmm8 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] -; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2OR3-NEXT: movdqa %xmm4, %xmm2 -; SSE2OR3-NEXT: pxor %xmm5, %xmm2 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2] +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm4 +; SSE2OR3-NEXT: pand %xmm10, %xmm4 +; SSE2OR3-NEXT: pand %xmm4, %xmm3 +; SSE2OR3-NEXT: pxor %xmm7, %xmm4 +; SSE2OR3-NEXT: por %xmm3, %xmm4 +; SSE2OR3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm8[0,2] +; SSE2OR3-NEXT: pslld $16, %xmm4 +; SSE2OR3-NEXT: psrad $16, %xmm4 +; SSE2OR3-NEXT: movdqa %xmm2, %xmm3 +; SSE2OR3-NEXT: pxor %xmm5, %xmm3 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] ; SSE2OR3-NEXT: movdqa %xmm6, %xmm9 ; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm3 +; SSE2OR3-NEXT: pand %xmm9, %xmm3 +; SSE2OR3-NEXT: pand %xmm3, %xmm2 +; SSE2OR3-NEXT: pxor %xmm7, %xmm3 +; SSE2OR3-NEXT: por %xmm2, %xmm3 +; SSE2OR3-NEXT: movdqa %xmm1, %xmm2 +; SSE2OR3-NEXT: pxor %xmm5, %xmm2 +; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2] +; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm6 ; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2OR3-NEXT: pand %xmm9, %xmm2 -; SSE2OR3-NEXT: pand %xmm2, %xmm4 -; SSE2OR3-NEXT: pxor %xmm7, %xmm2 -; SSE2OR3-NEXT: por %xmm4, %xmm2 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm2 = 
xmm2[0,2,2,3] -; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; SSE2OR3-NEXT: movdqa %xmm3, %xmm4 -; SSE2OR3-NEXT: pxor %xmm5, %xmm4 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2] -; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm6 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE2OR3-NEXT: pand %xmm6, %xmm4 -; SSE2OR3-NEXT: pxor %xmm4, %xmm7 -; SSE2OR3-NEXT: pand %xmm3, %xmm4 -; SSE2OR3-NEXT: por %xmm7, %xmm4 -; SSE2OR3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; SSE2OR3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] -; SSE2OR3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE2OR3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] -; SSE2OR3-NEXT: psubusw %xmm3, %xmm0 +; SSE2OR3-NEXT: pand %xmm6, %xmm2 +; SSE2OR3-NEXT: pxor %xmm2, %xmm7 +; SSE2OR3-NEXT: pand %xmm1, %xmm2 +; SSE2OR3-NEXT: por %xmm7, %xmm2 +; SSE2OR3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE2OR3-NEXT: pslld $16, %xmm2 +; SSE2OR3-NEXT: psrad $16, %xmm2 +; SSE2OR3-NEXT: packssdw %xmm4, %xmm2 +; SSE2OR3-NEXT: psubusw %xmm2, %xmm0 ; SSE2OR3-NEXT: retq ; ; SSE41-LABEL: psubus_8i64_max: diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -373,20 +373,16 @@ ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = mem[0,2,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; X86-SSE2-NEXT: psllw $15, %xmm2 -; X86-SSE2-NEXT: packsswb %xmm2, %xmm2 -; X86-SSE2-NEXT: pmovmskb %xmm2, %eax +; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X86-SSE2-NEXT: pslld $16, %xmm0 +; X86-SSE2-NEXT: psrad $16, %xmm0 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],mem[0,2] +; X86-SSE2-NEXT: pslld $16, %xmm2 +; X86-SSE2-NEXT: psrad $16, %xmm2 +; X86-SSE2-NEXT: packssdw %xmm2, %xmm0 +; X86-SSE2-NEXT: psllw $15, %xmm0 +; X86-SSE2-NEXT: packsswb %xmm0, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax ; X86-SSE2-NEXT: cmpb $-1, %al ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: movl %ebp, %esp @@ -395,20 +391,16 @@ ; ; X64-SSE2-LABEL: trunc_v8i64_v8i1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-SSE2-NEXT: movsd {{.*#+}} xmm2 = 
xmm0[0],xmm2[1] -; X64-SSE2-NEXT: psllw $15, %xmm2 -; X64-SSE2-NEXT: packsswb %xmm2, %xmm2 -; X64-SSE2-NEXT: pmovmskb %xmm2, %eax +; X64-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; X64-SSE2-NEXT: pslld $16, %xmm2 +; X64-SSE2-NEXT: psrad $16, %xmm2 +; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-SSE2-NEXT: pslld $16, %xmm0 +; X64-SSE2-NEXT: psrad $16, %xmm0 +; X64-SSE2-NEXT: packssdw %xmm2, %xmm0 +; X64-SSE2-NEXT: psllw $15, %xmm0 +; X64-SSE2-NEXT: packsswb %xmm0, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpb $-1, %al ; X64-SSE2-NEXT: sete %al ; X64-SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -371,19 +371,15 @@ ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = mem[0,2,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; X86-SSE2-NEXT: psllw $15, %xmm2 -; X86-SSE2-NEXT: pmovmskb %xmm2, %eax +; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X86-SSE2-NEXT: pslld $16, %xmm0 +; X86-SSE2-NEXT: psrad $16, %xmm0 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],mem[0,2] +; X86-SSE2-NEXT: pslld $16, %xmm2 +; X86-SSE2-NEXT: psrad $16, %xmm2 +; X86-SSE2-NEXT: packssdw %xmm2, %xmm0 +; X86-SSE2-NEXT: psllw $15, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax ; X86-SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: movl %ebp, %esp @@ -392,19 +388,15 @@ ; ; X64-SSE2-LABEL: trunc_v8i64_v8i1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; X64-SSE2-NEXT: psllw $15, %xmm2 -; X64-SSE2-NEXT: pmovmskb %xmm2, %eax +; X64-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; X64-SSE2-NEXT: pslld $16, %xmm2 +; X64-SSE2-NEXT: psrad $16, %xmm2 +; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-SSE2-NEXT: pslld $16, %xmm0 +; X64-SSE2-NEXT: psrad $16, %xmm0 +; X64-SSE2-NEXT: packssdw %xmm2, %xmm0 +; X64-SSE2-NEXT: psllw $15, %xmm0 +; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA ; X64-SSE2-NEXT: setne %al ; X64-SSE2-NEXT: retq diff --git 
a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -507,20 +507,16 @@
 ; X86-SSE2-NEXT: movl %esp, %ebp
 ; X86-SSE2-NEXT: andl $-16, %esp
 ; X86-SSE2-NEXT: subl $16, %esp
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; X86-SSE2-NEXT: psllw $15, %xmm2
-; X86-SSE2-NEXT: packsswb %xmm2, %xmm2
-; X86-SSE2-NEXT: pmovmskb %xmm2, %eax
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X86-SSE2-NEXT: pslld $16, %xmm0
+; X86-SSE2-NEXT: psrad $16, %xmm0
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],mem[0,2]
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: psrad $16, %xmm2
+; X86-SSE2-NEXT: packssdw %xmm2, %xmm0
+; X86-SSE2-NEXT: psllw $15, %xmm0
+; X86-SSE2-NEXT: packsswb %xmm0, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
 ; X86-SSE2-NEXT: testb %al, %al
 ; X86-SSE2-NEXT: setnp %al
 ; X86-SSE2-NEXT: movl %ebp, %esp
@@ -529,20 +525,16 @@
 ;
 ; X64-SSE2-LABEL: trunc_v8i64_v8i1:
 ; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; X64-SSE2-NEXT: psllw $15, %xmm2
-; X64-SSE2-NEXT: packsswb %xmm2, %xmm2
-; X64-SSE2-NEXT: pmovmskb %xmm2, %eax
+; X64-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; X64-SSE2-NEXT: pslld $16, %xmm2
+; X64-SSE2-NEXT: psrad $16, %xmm2
+; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE2-NEXT: pslld $16, %xmm0
+; X64-SSE2-NEXT: psrad $16, %xmm0
+; X64-SSE2-NEXT: packssdw %xmm2, %xmm0
+; X64-SSE2-NEXT: psllw $15, %xmm0
+; X64-SSE2-NEXT: packsswb %xmm0, %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
 ; X64-SSE2-NEXT: testb %al, %al
 ; X64-SSE2-NEXT: setnp %al
 ; X64-SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -73,21 +73,17 @@
 define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; SSE-LABEL: trunc_add_v8i64_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: paddq %xmm6, %xmm2
-; SSE-NEXT: paddq %xmm7, %xmm3
-; SSE-NEXT: paddq %xmm4, %xmm0
 ; SSE-NEXT: paddq %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: paddq %xmm7, %xmm3
+; SSE-NEXT: paddq %xmm6, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_add_v8i64_v8i16:
@@ -515,17 +511,13 @@
 define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_add_const_v8i64_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
 ; SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
@@ -854,21 +846,17 @@
 define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; SSE-LABEL: trunc_sub_v8i64_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: psubq %xmm6, %xmm2
-; SSE-NEXT: psubq %xmm7, %xmm3
-; SSE-NEXT: psubq %xmm4, %xmm0
 ; SSE-NEXT: psubq %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: psubq %xmm4, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: psubq %xmm7, %xmm3
+; SSE-NEXT: psubq %xmm6, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_sub_v8i64_v8i16:
@@ -1266,17 +1254,13 @@
 define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
 ; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
@@ -1665,29 +1649,21 @@
 define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; SSE-LABEL: trunc_mul_v8i64_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: pmullw %xmm6, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
+; SSE-NEXT: pslld $16, %xmm6
+; SSE-NEXT: psrad $16, %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
+; SSE-NEXT: pslld $16, %xmm4
+; SSE-NEXT: psrad $16, %xmm4
+; SSE-NEXT: packssdw %xmm6, %xmm4
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
+; SSE-NEXT: pmullw %xmm4, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_mul_v8i64_v8i16:
@@ -2194,17 +2170,13 @@
 define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
 ; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
@@ -2606,21 +2578,17 @@
 define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; SSE-LABEL: trunc_and_v8i64_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: andps %xmm5, %xmm1
+; SSE-NEXT: andps %xmm4, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: andps %xmm7, %xmm3
+; SSE-NEXT: andps %xmm6, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_and_v8i64_v8i16:
@@ -2968,18 +2936,14 @@
 define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_and_const_v8i64_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
@@ -3305,21 +3269,17 @@
 define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; SSE-LABEL: trunc_xor_v8i64_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm7, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: pxor %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: xorps %xmm5, %xmm1
+; SSE-NEXT: xorps %xmm4, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: xorps %xmm7, %xmm3
+; SSE-NEXT: xorps %xmm6, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_xor_v8i64_v8i16:
@@ -3667,18 +3627,14 @@
 define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
+; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
@@ -4004,21 +3960,17 @@
 define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; SSE-LABEL: trunc_or_v8i64_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: por %xmm4, %xmm0
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: orps %xmm5, %xmm1
+; SSE-NEXT: orps %xmm4, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: orps %xmm7, %xmm3
+; SSE-NEXT: orps %xmm6, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_or_v8i64_v8i16:
@@ -4366,18 +4318,14 @@
 define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; SSE-LABEL: trunc_or_const_v8i64_v8i16:
 ; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: pslld $16, %xmm2
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: pslld $16, %xmm0
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
+; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -1505,120 +1505,112 @@
 define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" {
 ; SSE2-SSSE3-LABEL: trunc_packus_v8i64_v8i16:
 ; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm8
-; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm2
+; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm5
+; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0
 ; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm3
-; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm6
+; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm8
 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183]
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm10
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm11, %xmm1
-; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: por %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm2
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT: pxor %xmm7, %xmm7
 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183]
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10
 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm10
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
 ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
 ; SSE2-SSSE3-NEXT: por %xmm11, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm8
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3
 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm2
-; SSE2-SSSE3-NEXT: por %xmm8, %xmm2
-; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm8
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm8
+; SSE2-SSSE3-NEXT: por %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm3
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm10
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm11, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm8
+; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: por %xmm8, %xmm3
+; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm8
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm8
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm10
 ; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm10
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
 ; SSE2-SSSE3-NEXT: pand %xmm9, %xmm11
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,1,3,3]
 ; SSE2-SSSE3-NEXT: por %xmm11, %xmm8
-; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6
+; SSE2-SSSE3-NEXT: pand %xmm8, %xmm5
 ; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm8
-; SSE2-SSSE3-NEXT: por %xmm6, %xmm8
-; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm6
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm6
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm5, %xmm8
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm5
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm7, %xmm9
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm6, %xmm5
-; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3
-; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm5
-; SSE2-SSSE3-NEXT: por %xmm3, %xmm5
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm3
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm3
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,2,2]
+; SSE2-SSSE3-NEXT: pand %xmm9, %xmm5
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm5, %xmm6
+; SSE2-SSSE3-NEXT: pand %xmm6, %xmm0
+; SSE2-SSSE3-NEXT: pandn %xmm4, %xmm6
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm6
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm4
+; SSE2-SSSE3-NEXT: pand %xmm6, %xmm4
+; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm6
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm6, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
 ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm6, %xmm7
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm7, %xmm3
-; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3
-; SSE2-SSSE3-NEXT: movdqa %xmm8, %xmm4
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm4
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm4
 ; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm6, %xmm7
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm7, %xmm4
-; SSE2-SSSE3-NEXT: pand %xmm8, %xmm4
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm5
-; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm4
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm4
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm7, %xmm5
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm5, %xmm6
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm6
-; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3]
-; SSE2-SSSE3-NEXT: por %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-SSSE3-NEXT: por %xmm4, %xmm5
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-SSSE3-NEXT: por %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm3
+; SSE2-SSSE3-NEXT: psrad $16, %xmm3
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm0
 ; SSE2-SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: trunc_packus_v8i64_v8i16:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -1090,67 +1090,63 @@
 define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) {
 ; SSE2-SSSE3-LABEL: trunc_usat_v8i64_v8i16:
 ; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm4
-; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm1
-; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm6
-; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm7
+; SSE2-SSSE3-NEXT: movdqa (%rdi), %xmm6
+; SSE2-SSSE3-NEXT: movdqa 16(%rdi), %xmm0
+; SSE2-SSSE3-NEXT: movdqa 32(%rdi), %xmm1
+; SSE2-SSSE3-NEXT: movdqa 48(%rdi), %xmm5
 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm0
-; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm8
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm5, %xmm8
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,3,3]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm7
+; SSE2-SSSE3-NEXT: pand %xmm8, %xmm7
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-SSSE3-NEXT: pand %xmm7, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm7
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm7
+; SSE2-SSSE3-NEXT: movdqa %xmm6, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm9
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pand %xmm8, %xmm0
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE2-SSSE3-NEXT: pand %xmm9, %xmm0
 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm6
-; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0
 ; SSE2-SSSE3-NEXT: por %xmm6, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm7, %xmm6
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[0,2]
+; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm6
 ; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm6
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm9
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm8
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm8
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm6
-; SSE2-SSSE3-NEXT: pand %xmm9, %xmm6
-; SSE2-SSSE3-NEXT: pand %xmm6, %xmm7
-; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm6
-; SSE2-SSSE3-NEXT: por %xmm7, %xmm6
-; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm7
-; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm7
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm9
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm7
-; SSE2-SSSE3-NEXT: pand %xmm9, %xmm7
-; SSE2-SSSE3-NEXT: pand %xmm7, %xmm4
-; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm7
-; SSE2-SSSE3-NEXT: por %xmm4, %xmm7
-; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4
-; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSE2-SSSE3-NEXT: pand %xmm1, %xmm4
-; SSE2-SSSE3-NEXT: por %xmm5, %xmm4
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-SSSE3-NEXT: pand %xmm8, %xmm6
+; SSE2-SSSE3-NEXT: pand %xmm6, %xmm5
+; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSE2-SSSE3-NEXT: por %xmm5, %xmm6
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm5
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm3
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm5
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm5
+; SSE2-SSSE3-NEXT: pxor %xmm5, %xmm4
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5
+; SSE2-SSSE3-NEXT: por %xmm4, %xmm5
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm5
+; SSE2-SSSE3-NEXT: psrad $16, %xmm5
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm0
 ; SSE2-SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: trunc_usat_v8i64_v8i16:
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -159,17 +159,13 @@
 define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
 ; SSE2-SSSE3-LABEL: trunc8i64_8i16:
 ; SSE2-SSSE3: # %bb.0: # %entry
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm2
+; SSE2-SSSE3-NEXT: psrad $16, %xmm2
+; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-SSSE3-NEXT: pslld $16, %xmm0
+; SSE2-SSSE3-NEXT: psrad $16, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm2, %xmm0
 ; SSE2-SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: trunc8i64_8i16: