Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -128,6 +128,7 @@ SDValue ExpandFNEG(SDValue Op); SDValue ExpandFSUB(SDValue Op); SDValue ExpandBITREVERSE(SDValue Op); + SDValue ExpandCTPOP(SDValue Op); SDValue ExpandCTLZ(SDValue Op); SDValue ExpandCTTZ(SDValue Op); SDValue ExpandStrictFPOp(SDValue Op); @@ -714,6 +715,8 @@ return UnrollVSETCC(Op); case ISD::BITREVERSE: return ExpandBITREVERSE(Op); + case ISD::CTPOP: + return ExpandCTPOP(Op); case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: return ExpandCTLZ(Op); @@ -1072,6 +1075,28 @@ return DAG.UnrollVectorOp(Op.getNode()); } +SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) { + EVT VT = Op.getValueType(); + unsigned NumBitsPerElt = VT.getScalarSizeInBits(); + + // If we have the scalar operation, it's probably cheaper to unroll it. + if (TLI.isOperationLegalOrCustom(ISD::CTPOP, VT.getScalarType())) + return DAG.UnrollVectorOp(Op.getNode()); + + // If we have the appropriate vector bit operations, it is better to use them + // than unrolling and expanding each component. + if (8 <= NumBitsPerElt && isPowerOf2_32(NumBitsPerElt) && + TLI.isOperationLegalOrCustom(ISD::SUB, VT) && + TLI.isOperationLegalOrCustom(ISD::SRL, VT) && + (NumBitsPerElt == 8 || TLI.isOperationLegalOrCustom(ISD::MUL, VT)) && + TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT) && + TLI.isOperationLegalOrCustomOrPromote(ISD::AND, VT)) + return Op; + + // Otherwise go ahead and unroll. + return DAG.UnrollVectorOp(Op.getNode()); +} + SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) { EVT VT = Op.getValueType(); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -25135,57 +25135,6 @@ return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt); } -static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - assert(VT == MVT::v16i8 && "Only v16i8 vector CTPOP lowering supported."); - - // This is the vectorized version of the "best" algorithm from - // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel - // with a minor tweak to use a series of adds + shifts instead of vector - // multiplications. Implemented for all integer vector types. We only use - // this when we don't have SSSE3 which allows a LUT-based lowering that is - // much faster, even faster than using native popcnt instructions. - - auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) { - MVT VT = V.getSimpleValueType(); - SDValue ShifterV = DAG.getConstant(Shifter, DL, VT); - return DAG.getNode(OpCode, DL, VT, V, ShifterV); - }; - auto GetMask = [&](SDValue V, APInt Mask) { - MVT VT = V.getSimpleValueType(); - SDValue MaskV = DAG.getConstant(Mask, DL, VT); - return DAG.getNode(ISD::AND, DL, VT, V, MaskV); - }; - - // We don't want to incur the implicit masks required to SRL vNi8 vectors on - // x86, so set the SRL type to have elements at least i16 wide. This is - // correct because all of our SRLs are followed immediately by a mask anyways - // that handles any bits that sneak into the high bits of the byte elements. - MVT SrlVT = MVT::v8i16; - SDValue V = Op; - - // v = v - ((v >> 1) & 0x55555555...) 
- SDValue Srl = - DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1)); - SDValue And = GetMask(Srl, APInt(8, 0x55)); - V = DAG.getNode(ISD::SUB, DL, VT, V, And); - - // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) - SDValue AndLHS = GetMask(V, APInt(8, 0x33)); - Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2)); - SDValue AndRHS = GetMask(Srl, APInt(8, 0x33)); - V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS); - - // v = (v + (v >> 4)) & 0x0F0F0F0F... - Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4)); - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl); - V = GetMask(Add, APInt(8, 0x0F)); - - return V; -} - // Please ensure that any codegen change from LowerVectorCTPOP is reflected in // updated cost models in X86TTIImpl::getIntrinsicInstrCost. static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, @@ -25225,9 +25174,9 @@ return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG); } - // We can't use the fast LUT approach, so fall back on vectorized bitmath. + // We can't use the fast LUT approach, so fall back on LegalizeDAG. if (!Subtarget.hasSSSE3()) - return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG); + return SDValue(); return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); } Index: test/CodeGen/AArch64/arm64-vpopcnt.ll =================================================================== --- test/CodeGen/AArch64/arm64-vpopcnt.ll +++ test/CodeGen/AArch64/arm64-vpopcnt.ll @@ -17,30 +17,21 @@ define <4 x i16> @ctpopv4i16(<4 x i16> %x) nounwind readnone { ; CHECK-LABEL: ctpopv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: cnt v1.8b, v1.8b -; CHECK-NEXT: uaddlv h1, v1.8b -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: cnt v2.8b, v2.8b -; CHECK-NEXT: uaddlv h2, v2.8b -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v1.h[1], w8 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: cnt v2.8b, v2.8b -; CHECK-NEXT: uaddlv h2, v2.8b -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v1.h[2], w8 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlv h0, v0.8b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ushr v1.4h, v0.4h, #1 +; CHECK-NEXT: movi v2.8b, #85 +; CHECK-NEXT: and v1.8b, v1.8b, v2.8b +; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: movi v1.8b, #51 +; CHECK-NEXT: and v2.8b, v0.8b, v1.8b +; CHECK-NEXT: ushr v0.4h, v0.4h, #2 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: add v0.4h, v2.4h, v0.4h +; CHECK-NEXT: usra v0.4h, v0.4h, #4 +; CHECK-NEXT: movi v1.8b, #15 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: movi v1.8b, #1 +; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ushr v0.4h, v0.4h, #8 ; CHECK-NEXT: ret %cnt = tail call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %x) ret <4 x i16> %cnt @@ -84,53 +75,21 @@ define <8 x i16> @ctpopv8i16(<8 x i16> %x) nounwind readnone { ; CHECK-LABEL: ctpopv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: cnt v1.8b, v1.8b -; CHECK-NEXT: uaddlv h1, v1.8b -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: fmov d1, x9 -; CHECK-NEXT: cnt v1.8b, v1.8b -; CHECK-NEXT: uaddlv h1, v1.8b -; CHECK-NEXT: mov v1.h[1], w8 -; CHECK-NEXT: umov w8, v0.h[2] -; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: cnt v2.8b, v2.8b -; 
CHECK-NEXT: uaddlv h2, v2.8b -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v1.h[2], w8 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: cnt v2.8b, v2.8b -; CHECK-NEXT: uaddlv h2, v2.8b -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: umov w8, v0.h[4] -; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: cnt v2.8b, v2.8b -; CHECK-NEXT: uaddlv h2, v2.8b -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v1.h[4], w8 -; CHECK-NEXT: umov w8, v0.h[5] -; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: cnt v2.8b, v2.8b -; CHECK-NEXT: uaddlv h2, v2.8b -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v1.h[5], w8 -; CHECK-NEXT: umov w8, v0.h[6] -; CHECK-NEXT: fmov d2, x8 -; CHECK-NEXT: cnt v2.8b, v2.8b -; CHECK-NEXT: uaddlv h2, v2.8b -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov v1.h[6], w8 -; CHECK-NEXT: umov w8, v0.h[7] -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: cnt v0.8b, v0.8b -; CHECK-NEXT: uaddlv h0, v0.8b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov v1.h[7], w8 -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ushr v1.8h, v0.8h, #1 +; CHECK-NEXT: movi v2.16b, #85 +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-NEXT: movi v1.16b, #51 +; CHECK-NEXT: and v2.16b, v0.16b, v1.16b +; CHECK-NEXT: ushr v0.8h, v0.8h, #2 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: add v0.8h, v2.8h, v0.8h +; CHECK-NEXT: usra v0.8h, v0.8h, #4 +; CHECK-NEXT: movi v1.16b, #15 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v1.16b, #1 +; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 ; CHECK-NEXT: ret %cnt = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %x) ret <8 x i16> %cnt Index: test/CodeGen/X86/vec_ctbits.ll =================================================================== --- test/CodeGen/X86/vec_ctbits.ll +++ test/CodeGen/X86/vec_ctbits.ll @@ -14,17 +14,21 @@ ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $1, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 ; CHECK-NEXT: psubb %xmm1, %xmm0 ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pand %xmm1, %xmm2 ; CHECK-NEXT: psrlw $2, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-NEXT: pand %xmm1, %xmm0 ; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $4, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: pand %xmm2, %xmm1 ; CHECK-NEXT: paddb %xmm0, %xmm1 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 +; CHECK-NEXT: pand %xmm2, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm0 ; CHECK-NEXT: psadbw %xmm0, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 @@ -59,17 +63,21 @@ ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: psrlw $1, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-NEXT: psubb %xmm0, %xmm1 ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pand %xmm0, %xmm2 ; CHECK-NEXT: psrlw $2, %xmm1 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: paddb %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: psrlw $4, %xmm2 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: pand %xmm0, %xmm2 ; CHECK-NEXT: paddb %xmm1, %xmm2 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-NEXT: pand %xmm0, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm0 ; CHECK-NEXT: 
psadbw %xmm2, %xmm0 ; CHECK-NEXT: retq @@ -84,17 +92,21 @@ ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $1, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 ; CHECK-NEXT: psubb %xmm1, %xmm0 ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pand %xmm1, %xmm2 ; CHECK-NEXT: psrlw $2, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-NEXT: pand %xmm1, %xmm0 ; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $4, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: pand %xmm2, %xmm1 ; CHECK-NEXT: paddb %xmm0, %xmm1 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 +; CHECK-NEXT: pand %xmm2, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm0 ; CHECK-NEXT: psadbw %xmm0, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 @@ -117,17 +129,21 @@ ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $1, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 ; CHECK-NEXT: psubb %xmm1, %xmm0 ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pand %xmm1, %xmm2 ; CHECK-NEXT: psrlw $2, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-NEXT: pand %xmm1, %xmm0 ; CHECK-NEXT: paddb %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $4, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: pand %xmm2, %xmm1 ; CHECK-NEXT: paddb %xmm0, %xmm1 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 +; CHECK-NEXT: pand %xmm2, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm0 ; CHECK-NEXT: psadbw %xmm0, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 @@ -163,17 +179,21 @@ ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: psrlw $1, %xmm0 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-NEXT: psubb %xmm0, %xmm1 ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: pand %xmm0, %xmm2 ; CHECK-NEXT: psrlw $2, %xmm1 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 ; CHECK-NEXT: pand %xmm0, %xmm1 ; CHECK-NEXT: paddb %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm2 ; CHECK-NEXT: psrlw $4, %xmm2 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: pand %xmm0, %xmm2 ; CHECK-NEXT: paddb %xmm1, %xmm2 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm2 +; CHECK-NEXT: pand %xmm0, %xmm2 ; CHECK-NEXT: pxor %xmm0, %xmm0 ; CHECK-NEXT: psadbw %xmm2, %xmm0 ; CHECK-NEXT: psubq {{.*}}(%rip), %xmm0 @@ -191,17 +211,21 @@ ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $1, %xmm1 ; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 ; CHECK-NEXT: psubb %xmm1, %xmm0 ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; CHECK-NEXT: movdqa %xmm0, %xmm3 ; CHECK-NEXT: pand %xmm1, %xmm3 ; CHECK-NEXT: psrlw $2, %xmm0 +; CHECK-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-NEXT: pand %xmm1, %xmm0 ; CHECK-NEXT: paddb %xmm3, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $4, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: paddb %xmm0, %xmm1 -; CHECK-NEXT: pand {{.*}}(%rip), %xmm1 +; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: psadbw %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq Index: 
test/CodeGen/X86/vector-lzcnt-128.ll =================================================================== --- test/CodeGen/X86/vector-lzcnt-128.ll +++ test/CodeGen/X86/vector-lzcnt-128.ll @@ -39,17 +39,21 @@ ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: psubb %xmm0, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: paddb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: psadbw %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -79,17 +83,21 @@ ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: psubb %xmm0, %xmm1 ; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 ; SSE3-NEXT: paddb %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: psrlw $4, %xmm2 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: psadbw %xmm2, %xmm0 ; SSE3-NEXT: retq @@ -305,17 +313,21 @@ ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: psubb %xmm0, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: paddb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: psadbw %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -345,17 +357,21 @@ ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: psubb %xmm0, %xmm1 ; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 ; SSE3-NEXT: paddb %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: psrlw $4, %xmm2 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: psadbw %xmm2, %xmm0 ; SSE3-NEXT: retq @@ -568,17 
+584,21 @@ ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: psubb %xmm0, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: psrlw $2, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: paddb %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -610,17 +630,21 @@ ; SSE3-NEXT: movdqa %xmm2, %xmm0 ; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: psubb %xmm0, %xmm2 ; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm2, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 ; SSE3-NEXT: psrlw $2, %xmm2 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: paddb %xmm1, %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 ; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: pxor %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -810,17 +834,21 @@ ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: psubb %xmm0, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: psrlw $2, %xmm2 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: paddb %xmm1, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -852,17 +880,21 @@ ; SSE3-NEXT: movdqa %xmm2, %xmm0 ; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: psubb %xmm0, %xmm2 ; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm2, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 ; SSE3-NEXT: psrlw $2, %xmm2 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: paddb %xmm1, %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 ; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: pxor %xmm1, %xmm1 ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = 
xmm2[2],xmm1[2],xmm2[3],xmm1[3] @@ -1049,17 +1081,21 @@ ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: psubb %xmm0, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: paddb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psllw $8, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 @@ -1085,17 +1121,21 @@ ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: psubb %xmm0, %xmm1 ; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 ; SSE3-NEXT: paddb %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: psrlw $4, %xmm2 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 ; SSE3-NEXT: psllw $8, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 @@ -1255,17 +1295,21 @@ ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrlw $1, %xmm0 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: psubb %xmm0, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: paddb %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psllw $8, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 @@ -1291,17 +1335,21 @@ ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psrlw $1, %xmm0 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: psubb %xmm0, %xmm1 ; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 ; SSE3-NEXT: paddb %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: psrlw $4, %xmm2 +; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: paddb %xmm1, %xmm2 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm2 +; SSE3-NEXT: pand %xmm0, %xmm2 ; SSE3-NEXT: movdqa %xmm2, %xmm0 ; SSE3-NEXT: psllw $8, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 @@ -1444,67 +1492,77 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; SSE2-LABEL: testv16i8: ; 
SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psrlw $4, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psrlw $1, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 +; SSE2-NEXT: psubb %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: paddb %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: paddb %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8: ; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: psrlw $1, %xmm2 +; SSE3-NEXT: movdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; SSE3-NEXT: pand %xmm3, %xmm2 +; SSE3-NEXT: por %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 ; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: por %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm4 +; SSE3-NEXT: psrlw $4, %xmm4 ; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE3-NEXT: pand %xmm2, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE3-NEXT: pxor %xmm1, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm3, %xmm1 +; SSE3-NEXT: 
pand %xmm2, %xmm4 +; SSE3-NEXT: por %xmm0, %xmm4 +; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE3-NEXT: pxor %xmm4, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm4 +; SSE3-NEXT: psrlw $1, %xmm4 +; SSE3-NEXT: pand %xmm3, %xmm4 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm4 +; SSE3-NEXT: psubb %xmm4, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm4 +; SSE3-NEXT: pand %xmm3, %xmm4 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm3, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: psrlw $2, %xmm3 -; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddb %xmm1, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 +; SSE3-NEXT: paddb %xmm4, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psrlw $4, %xmm0 -; SSE3-NEXT: paddb %xmm3, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: retq ; @@ -1607,67 +1665,77 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; SSE2-LABEL: testv16i8u: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: por %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psrlw $4, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psubb %xmm0, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: por %xmm0, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: psrlw $1, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm4 +; SSE2-NEXT: psubb %xmm4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: psrlw $2, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: paddb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: paddb %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: paddb %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8u: ; SSE3: # %bb.0: -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: psrlw $1, %xmm2 +; SSE3-NEXT: movdqa {{.*#+}} xmm3 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; SSE3-NEXT: pand %xmm3, %xmm2 +; SSE3-NEXT: por %xmm0, %xmm2 +; SSE3-NEXT: movdqa %xmm2, %xmm0 ; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: por %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm4 +; SSE3-NEXT: psrlw $4, %xmm4 ; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE3-NEXT: pand %xmm2, %xmm1 -; SSE3-NEXT: por %xmm0, %xmm1 -; SSE3-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE3-NEXT: pxor %xmm1, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 -; SSE3-NEXT: psrlw $1, %xmm0 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE3-NEXT: psubb %xmm0, %xmm3 -; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; SSE3-NEXT: movdqa %xmm3, %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm4 +; SSE3-NEXT: por %xmm0, %xmm4 +; SSE3-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE3-NEXT: pxor %xmm4, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm4 +; SSE3-NEXT: psrlw $1, %xmm4 +; SSE3-NEXT: pand %xmm3, %xmm4 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm4 +; SSE3-NEXT: psubb %xmm4, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm4 +; SSE3-NEXT: pand %xmm3, %xmm4 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm3, %xmm1 ; SSE3-NEXT: pand %xmm0, %xmm1 -; SSE3-NEXT: psrlw $2, %xmm3 -; SSE3-NEXT: pand %xmm0, %xmm3 -; SSE3-NEXT: paddb %xmm1, %xmm3 -; SSE3-NEXT: movdqa %xmm3, %xmm0 +; SSE3-NEXT: paddb %xmm4, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psrlw $4, %xmm0 -; SSE3-NEXT: paddb %xmm3, %xmm0 +; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: pand %xmm2, %xmm0 ; SSE3-NEXT: retq ; Index: test/CodeGen/X86/vector-popcnt-128.ll =================================================================== --- test/CodeGen/X86/vector-popcnt-128.ll +++ test/CodeGen/X86/vector-popcnt-128.ll @@ -16,17 +16,21 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: psubb %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 @@ -37,17 +41,21 @@ ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: psubb %xmm1, %xmm0 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: psadbw %xmm0, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 @@ -151,17 +159,21 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: psubb %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -177,17 +189,21 @@ ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: psubb %xmm1, %xmm0 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -322,17 +338,21 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: psubb %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psllw $8, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm0 @@ -344,17 +364,21 @@ ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: psubb %xmm1, %xmm0 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psllw $8, %xmm0 ; SSE3-NEXT: paddb %xmm1, %xmm0 @@ -466,17 +490,21 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: psubb %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -485,17 +513,21 @@ ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: psubb %xmm1, %xmm0 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: retq ; Index: test/CodeGen/X86/vector-tzcnt-128.ll =================================================================== --- test/CodeGen/X86/vector-tzcnt-128.ll +++ test/CodeGen/X86/vector-tzcnt-128.ll @@ -24,17 +24,21 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: psubb %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 @@ -48,17 +52,21 @@ ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: psubb %xmm1, %xmm0 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm2, %xmm1 ; 
SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: psadbw %xmm0, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 @@ -227,17 +235,21 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: psubb %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: psadbw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 @@ -251,17 +263,21 @@ ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: psubb %xmm1, %xmm0 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: psadbw %xmm0, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 @@ -430,17 +446,21 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: psubb %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -459,17 +479,21 @@ ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: psubb %xmm1, %xmm0 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand %xmm2, 
%xmm1 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -673,17 +697,21 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: psubb %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -702,17 +730,21 @@ ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: psubb %xmm1, %xmm0 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: pxor %xmm0, %xmm0 ; SSE3-NEXT: movdqa %xmm1, %xmm2 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -916,17 +948,21 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: psubb %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psllw $8, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm0 @@ -941,17 +977,21 @@ ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: psubb %xmm1, %xmm0 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: 
movdqa %xmm1, %xmm0 ; SSE3-NEXT: psllw $8, %xmm0 ; SSE3-NEXT: paddb %xmm1, %xmm0 @@ -1092,17 +1132,21 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: psubb %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: psllw $8, %xmm0 ; SSE2-NEXT: paddb %xmm1, %xmm0 @@ -1117,17 +1161,21 @@ ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: psubb %xmm1, %xmm0 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: psllw $8, %xmm0 ; SSE3-NEXT: paddb %xmm1, %xmm0 @@ -1268,17 +1316,21 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: psubb %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1290,17 +1342,21 @@ ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: psubb %xmm1, %xmm0 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: retq ; @@ -1425,17 +1481,21 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: 
pand {{.*}}(%rip), %xmm1 ; SSE2-NEXT: psubb %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1447,17 +1507,21 @@ ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 ; SSE3-NEXT: psubb %xmm1, %xmm0 ; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm2 ; SSE3-NEXT: pand %xmm1, %xmm2 ; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: paddb %xmm2, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: paddb %xmm0, %xmm1 -; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pand %xmm2, %xmm1 ; SSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE3-NEXT: retq ;
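
For reference: on targets that cannot use the SSSE3 LUT lowering (and in the AArch64 v4i16/v8i16 cases above), CTPOP is now handled by the generic bit-math expansion that ExpandCTPOP gates on legal-or-custom vector SUB, SRL, AND and OR (plus MUL for elements wider than 8 bits). A minimal scalar sketch of that sequence for a 16-bit element is shown below; it is illustrative only, is not part of the patch, and the function names are invented for the example.

// Scalar model of the SWAR popcount sequence the generic expansion emits for
// 16-bit elements (compare the shift/mask constants in the updated
// ctpopv4i16/ctpopv8i16 CHECK lines above).
#include <cassert>
#include <cstdint>

static unsigned swarPopCount16(uint16_t V) {
  V = V - ((V >> 1) & 0x5555);            // 2-bit partial counts
  V = (V & 0x3333) + ((V >> 2) & 0x3333); // 4-bit partial counts
  V = (V + (V >> 4)) & 0x0F0F;            // per-byte counts
  V = uint16_t(V * 0x0101u);              // horizontal add of the two bytes
  return V >> 8;                          // final count lives in the high byte
}

static unsigned naivePopCount(uint16_t V) {
  unsigned N = 0;
  for (; V; V >>= 1)
    N += V & 1;
  return N;
}

int main() {
  // Exhaustively check the sketch against a naive bit loop.
  for (uint32_t I = 0; I <= 0xFFFF; ++I)
    assert(swarPopCount16(uint16_t(I)) == naivePopCount(uint16_t(I)));
  return 0;
}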