Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -128,6 +128,7 @@
   SDValue ExpandFNEG(SDValue Op);
   SDValue ExpandFSUB(SDValue Op);
   SDValue ExpandBITREVERSE(SDValue Op);
+  SDValue ExpandCTPOP(SDValue Op);
   SDValue ExpandCTLZ(SDValue Op);
   SDValue ExpandCTTZ(SDValue Op);
   SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
@@ -719,6 +720,8 @@
     return UnrollVSETCC(Op);
   case ISD::BITREVERSE:
     return ExpandBITREVERSE(Op);
+  case ISD::CTPOP:
+    return ExpandCTPOP(Op);
   case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF:
     return ExpandCTLZ(Op);
@@ -1080,6 +1083,16 @@
   return DAG.UnrollVectorOp(Op.getNode());
 }
 
+SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) {
+  // Attempt to expand using TargetLowering.
+  SDValue Result;
+  if (TLI.expandCTPOP(Op.getNode(), Result, DAG))
+    return Result;
+
+  // Otherwise go ahead and unroll.
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
   // Attempt to expand using TargetLowering.
   SDValue Result;
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4152,6 +4152,14 @@
   assert(VT.isInteger() && Len <= 128 && Len % 8 == 0 &&
          "CTPOP not implemented for this type.");
 
+  // Only expand vector types if we have the appropriate vector bit operations.
+  if (VT.isVector() && (!isOperationLegalOrCustom(ISD::ADD, VT) ||
+                        !isOperationLegalOrCustom(ISD::SUB, VT) ||
+                        !isOperationLegalOrCustom(ISD::SRL, VT) ||
+                        (Len != 8 && !isOperationLegalOrCustom(ISD::MUL, VT)) ||
+                        !isOperationLegalOrCustomOrPromote(ISD::AND, VT)))
+    return false;
+
   // This is the "best" algorithm from
   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
   SDValue Mask55 =
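Note: for reference, the node sequence that TargetLowering::expandCTPOP builds (and that the new guard above protects) is the scalar bithack cited in its comment. A minimal standalone C++ sketch of that algorithm, for illustration only (not LLVM code; the DAG expansion splats the 0x55/0x33/0x0F/0x01 masks across the vector elements):

#include <cassert>
#include <cstdint>

// 32-bit version of the "CountBitsSetParallel" bithack; expandCTPOP builds the
// same steps out of ISD::SRL/AND/SUB/ADD/MUL nodes, which is what the new
// guard checks for.
static uint32_t popcountBithack(uint32_t V) {
  V = V - ((V >> 1) & 0x55555555u);                 // per-2-bit counts
  V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u); // per-4-bit counts
  V = (V + (V >> 4)) & 0x0F0F0F0Fu;                 // per-byte counts
  return (V * 0x01010101u) >> 24;                   // horizontal byte sum via the multiply
}

int main() {
  assert(popcountBithack(0x00000000u) == 0);
  assert(popcountBithack(0xFFFFFFFFu) == 32);
  assert(popcountBithack(0x80000001u) == 2);
  return 0;
}

The final multiply is why the guard only requires a usable ISD::MUL when Len != 8: for 8-bit elements the per-byte count already is the result.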
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -25097,57 +25097,6 @@
   return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
 }
 
-static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
-                                       const X86Subtarget &Subtarget,
-                                       SelectionDAG &DAG) {
-  MVT VT = Op.getSimpleValueType();
-  assert(VT == MVT::v16i8 && "Only v16i8 vector CTPOP lowering supported.");
-
-  // This is the vectorized version of the "best" algorithm from
-  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-  // with a minor tweak to use a series of adds + shifts instead of vector
-  // multiplications. Implemented for all integer vector types. We only use
-  // this when we don't have SSSE3 which allows a LUT-based lowering that is
-  // much faster, even faster than using native popcnt instructions.
-
-  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
-    MVT VT = V.getSimpleValueType();
-    SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
-    return DAG.getNode(OpCode, DL, VT, V, ShifterV);
-  };
-  auto GetMask = [&](SDValue V, APInt Mask) {
-    MVT VT = V.getSimpleValueType();
-    SDValue MaskV = DAG.getConstant(Mask, DL, VT);
-    return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
-  };
-
-  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
-  // x86, so set the SRL type to have elements at least i16 wide. This is
-  // correct because all of our SRLs are followed immediately by a mask anyways
-  // that handles any bits that sneak into the high bits of the byte elements.
-  MVT SrlVT = MVT::v8i16;
-  SDValue V = Op;
-
-  // v = v - ((v >> 1) & 0x55555555...)
-  SDValue Srl =
-      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
-  SDValue And = GetMask(Srl, APInt(8, 0x55));
-  V = DAG.getNode(ISD::SUB, DL, VT, V, And);
-
-  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
-  SDValue AndLHS = GetMask(V, APInt(8, 0x33));
-  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
-  SDValue AndRHS = GetMask(Srl, APInt(8, 0x33));
-  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
-
-  // v = (v + (v >> 4)) & 0x0F0F0F0F...
-  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
-  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
-  V = GetMask(Add, APInt(8, 0x0F));
-
-  return V;
-}
-
 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
@@ -25187,9 +25136,9 @@
     return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
   }
 
-  // We can't use the fast LUT approach, so fall back on vectorized bitmath.
+  // We can't use the fast LUT approach, so fall back on LegalizeDAG.
   if (!Subtarget.hasSSSE3())
-    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
+    return SDValue();
 
   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
 }
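Note: the deleted LowerVectorCTPOPBitmath computed, per v16i8 lane, the 8-bit restriction of the same bithack, which needs no multiply step at all. A standalone scalar sketch of that per-byte math, for illustration only (not LLVM code):

#include <cassert>
#include <cstdint>

// Per-byte popcount as the removed bitmath lowering produced it for each
// v16i8 lane; this is also the Len == 8 case of the generic expansion, which
// is why the new guard in expandCTPOP skips the ISD::MUL requirement there.
static uint8_t popcount8(uint8_t V) {
  V = V - ((V >> 1) & 0x55);          // per-2-bit counts
  V = (V & 0x33) + ((V >> 2) & 0x33); // per-4-bit counts
  V = (V + (V >> 4)) & 0x0F;          // final count, fits in the low nibble
  return V;
}

int main() {
  assert(popcount8(0x00) == 0);
  assert(popcount8(0xFF) == 8);
  assert(popcount8(0xA5) == 4);
  return 0;
}

The visible codegen difference in the updated tests below is the extra pand after each psrlw $4: the deleted code shifted in v8i16 precisely to avoid that implicit byte mask, while the generic expansion shifts the byte vector directly, so the 0x0F mask is now materialized in a register and reused by both pands.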
Index: test/CodeGen/X86/vec_ctbits.ll
===================================================================
--- test/CodeGen/X86/vec_ctbits.ll
+++ test/CodeGen/X86/vec_ctbits.ll
@@ -23,8 +23,10 @@
 ; CHECK-NEXT: paddb %xmm2, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, %xmm1
 ; CHECK-NEXT: psrlw $4, %xmm1
+; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; CHECK-NEXT: pand %xmm2, %xmm1
 ; CHECK-NEXT: paddb %xmm0, %xmm1
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT: pand %xmm2, %xmm1
 ; CHECK-NEXT: pxor %xmm0, %xmm0
 ; CHECK-NEXT: psadbw %xmm0, %xmm1
 ; CHECK-NEXT: movdqa %xmm1, %xmm0
@@ -68,8 +70,10 @@
 ; CHECK-NEXT: paddb %xmm2, %xmm1
 ; CHECK-NEXT: movdqa %xmm1, %xmm2
 ; CHECK-NEXT: psrlw $4, %xmm2
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; CHECK-NEXT: pand %xmm0, %xmm2
 ; CHECK-NEXT: paddb %xmm1, %xmm2
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm2
+; CHECK-NEXT: pand %xmm0, %xmm2
 ; CHECK-NEXT: pxor %xmm0, %xmm0
 ; CHECK-NEXT: psadbw %xmm2, %xmm0
 ; CHECK-NEXT: retq
@@ -93,8 +97,10 @@
 ; CHECK-NEXT: paddb %xmm2, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, %xmm1
 ; CHECK-NEXT: psrlw $4, %xmm1
+; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; CHECK-NEXT: pand %xmm2, %xmm1
 ; CHECK-NEXT: paddb %xmm0, %xmm1
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT: pand %xmm2, %xmm1
 ; CHECK-NEXT: pxor %xmm0, %xmm0
 ; CHECK-NEXT: psadbw %xmm0, %xmm1
 ; CHECK-NEXT: movdqa %xmm1, %xmm0
@@ -126,8 +132,10 @@
 ; CHECK-NEXT: paddb %xmm2, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, %xmm1
 ; CHECK-NEXT: psrlw $4, %xmm1
+; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; CHECK-NEXT: pand %xmm2, %xmm1
 ; CHECK-NEXT: paddb %xmm0, %xmm1
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT: pand %xmm2, %xmm1
 ; CHECK-NEXT: pxor %xmm0, %xmm0
 ; CHECK-NEXT: psadbw %xmm0, %xmm1
 ; CHECK-NEXT: movdqa %xmm1, %xmm0
@@ -173,8 +181,10 @@
 ; CHECK-NEXT: paddb %xmm3, %xmm2
 ; CHECK-NEXT: movdqa %xmm2, %xmm0
 ; CHECK-NEXT: psrlw $4, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; CHECK-NEXT: pand %xmm3, %xmm0
 ; CHECK-NEXT: paddb %xmm2, %xmm0
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pand %xmm3, %xmm0
 ; CHECK-NEXT: psadbw %xmm1, %xmm0
 ; CHECK-NEXT: psubq {{.*}}(%rip), %xmm0
 ; CHECK-NEXT: retq
@@ -200,8 +210,10 @@
 ; CHECK-NEXT: paddb %xmm3, %xmm0
 ; CHECK-NEXT: movdqa %xmm0, %xmm1
 ; CHECK-NEXT: psrlw $4, %xmm1
+; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; CHECK-NEXT: pand %xmm3, %xmm1
 ; CHECK-NEXT: paddb %xmm0, %xmm1
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT: pand %xmm3, %xmm1
 ; CHECK-NEXT: psadbw %xmm2, %xmm1
 ; CHECK-NEXT: movdqa %xmm1, %xmm0
 ; CHECK-NEXT: retq
Index: test/CodeGen/X86/vector-lzcnt-128.ll
===================================================================
--- test/CodeGen/X86/vector-lzcnt-128.ll
+++ test/CodeGen/X86/vector-lzcnt-128.ll
@@ -48,8 +48,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm2
 ; SSE2-NEXT: psrlw $4, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm0, %xmm2
 ; SSE2-NEXT: paddb %xmm1, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: psadbw %xmm2, %xmm0
 ; SSE2-NEXT: retq
@@ -88,8 +90,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm2
 ; SSE3-NEXT: psrlw $4, %xmm2
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm0, %xmm2
 ; SSE3-NEXT: paddb %xmm1, %xmm2
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE3-NEXT: pand %xmm0, %xmm2
 ; SSE3-NEXT: pxor %xmm0, %xmm0
 ; SSE3-NEXT: psadbw %xmm2, %xmm0
 ; SSE3-NEXT: retq
@@ -314,8 +318,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm2
 ; SSE2-NEXT: psrlw $4, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm0, %xmm2
 ; SSE2-NEXT: paddb %xmm1, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: psadbw %xmm2, %xmm0
 ; SSE2-NEXT: retq
@@ -354,8 +360,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm2
 ; SSE3-NEXT: psrlw $4, %xmm2
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm0, %xmm2
 ; SSE3-NEXT: paddb %xmm1, %xmm2
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE3-NEXT: pand %xmm0, %xmm2
 ; SSE3-NEXT: pxor %xmm0, %xmm0
 ; SSE3-NEXT: psadbw %xmm2, %xmm0
 ; SSE3-NEXT: retq
@@ -577,8 +585,10 @@
 ; SSE2-NEXT: paddb %xmm1, %xmm2
 ; SSE2-NEXT: movdqa %xmm2, %xmm0
 ; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: paddb %xmm2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: pxor %xmm1, %xmm1
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -619,8 +629,10 @@
 ; SSE3-NEXT: paddb %xmm1, %xmm2
 ; SSE3-NEXT: movdqa %xmm2, %xmm0
 ; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm1, %xmm0
 ; SSE3-NEXT: paddb %xmm2, %xmm0
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT: pand %xmm1, %xmm0
 ; SSE3-NEXT: pxor %xmm1, %xmm1
 ; SSE3-NEXT: movdqa %xmm0, %xmm2
 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -819,8 +831,10 @@
 ; SSE2-NEXT: paddb %xmm1, %xmm2
 ; SSE2-NEXT: movdqa %xmm2, %xmm0
 ; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: paddb %xmm2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: pxor %xmm1, %xmm1
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -861,8 +875,10 @@
 ; SSE3-NEXT: paddb %xmm1, %xmm2
 ; SSE3-NEXT: movdqa %xmm2, %xmm0
 ; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm1, %xmm0
 ; SSE3-NEXT: paddb %xmm2, %xmm0
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT: pand %xmm1, %xmm0
 ; SSE3-NEXT: pxor %xmm1, %xmm1
 ; SSE3-NEXT: movdqa %xmm0, %xmm2
 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -1058,8 +1074,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm2
 ; SSE2-NEXT: psrlw $4, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm0, %xmm2
 ; SSE2-NEXT: paddb %xmm1, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
 ; SSE2-NEXT: movdqa %xmm2, %xmm0
 ; SSE2-NEXT: psllw $8, %xmm0
 ; SSE2-NEXT: paddb %xmm2, %xmm0
@@ -1094,8 +1112,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm2
 ; SSE3-NEXT: psrlw $4, %xmm2
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm0, %xmm2
 ; SSE3-NEXT: paddb %xmm1, %xmm2
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE3-NEXT: pand %xmm0, %xmm2
 ; SSE3-NEXT: movdqa %xmm2, %xmm0
 ; SSE3-NEXT: psllw $8, %xmm0
 ; SSE3-NEXT: paddb %xmm2, %xmm0
@@ -1264,8 +1284,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm2
 ; SSE2-NEXT: psrlw $4, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm0, %xmm2
 ; SSE2-NEXT: paddb %xmm1, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
 ; SSE2-NEXT: movdqa %xmm2, %xmm0
 ; SSE2-NEXT: psllw $8, %xmm0
 ; SSE2-NEXT: paddb %xmm2, %xmm0
@@ -1300,8 +1322,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm2
 ; SSE3-NEXT: psrlw $4, %xmm2
+; SSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm0, %xmm2
 ; SSE3-NEXT: paddb %xmm1, %xmm2
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm2
+; SSE3-NEXT: pand %xmm0, %xmm2
 ; SSE3-NEXT: movdqa %xmm2, %xmm0
 ; SSE3-NEXT: psllw $8, %xmm0
 ; SSE3-NEXT: paddb %xmm2, %xmm0
@@ -1471,6 +1495,7 @@
 ; SSE2-NEXT: paddb %xmm1, %xmm3
 ; SSE2-NEXT: movdqa %xmm3, %xmm0
 ; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: paddb %xmm3, %xmm0
 ; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: retq
@@ -1504,6 +1529,7 @@
 ; SSE3-NEXT: paddb %xmm1, %xmm3
 ; SSE3-NEXT: movdqa %xmm3, %xmm0
 ; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: pand %xmm2, %xmm0
 ; SSE3-NEXT: paddb %xmm3, %xmm0
 ; SSE3-NEXT: pand %xmm2, %xmm0
 ; SSE3-NEXT: retq
@@ -1634,6 +1660,7 @@
 ; SSE2-NEXT: paddb %xmm1, %xmm3
 ; SSE2-NEXT: movdqa %xmm3, %xmm0
 ; SSE2-NEXT: psrlw $4, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: paddb %xmm3, %xmm0
 ; SSE2-NEXT: pand %xmm2, %xmm0
 ; SSE2-NEXT: retq
@@ -1667,6 +1694,7 @@
 ; SSE3-NEXT: paddb %xmm1, %xmm3
 ; SSE3-NEXT: movdqa %xmm3, %xmm0
 ; SSE3-NEXT: psrlw $4, %xmm0
+; SSE3-NEXT: pand %xmm2, %xmm0
 ; SSE3-NEXT: paddb %xmm3, %xmm0
 ; SSE3-NEXT: pand %xmm2, %xmm0
 ; SSE3-NEXT: retq
Index: test/CodeGen/X86/vector-popcnt-128.ll
===================================================================
--- test/CodeGen/X86/vector-popcnt-128.ll
+++ test/CodeGen/X86/vector-popcnt-128.ll
@@ -25,8 +25,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: psadbw %xmm0, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
@@ -46,8 +48,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: pxor %xmm0, %xmm0
 ; SSE3-NEXT: psadbw %xmm0, %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm0
@@ -160,8 +164,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: movdqa %xmm1, %xmm2
 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -186,8 +192,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: pxor %xmm0, %xmm0
 ; SSE3-NEXT: movdqa %xmm1, %xmm2
 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -331,8 +339,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
 ; SSE2-NEXT: psllw $8, %xmm0
 ; SSE2-NEXT: paddb %xmm1, %xmm0
@@ -353,8 +363,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm0
 ; SSE3-NEXT: psllw $8, %xmm0
 ; SSE3-NEXT: paddb %xmm1, %xmm0
@@ -475,8 +487,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
@@ -494,8 +508,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm0
 ; SSE3-NEXT: retq
 ;
Index: test/CodeGen/X86/vector-tzcnt-128.ll
===================================================================
--- test/CodeGen/X86/vector-tzcnt-128.ll
+++ test/CodeGen/X86/vector-tzcnt-128.ll
@@ -33,8 +33,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: psadbw %xmm0, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
@@ -57,8 +59,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: pxor %xmm0, %xmm0
 ; SSE3-NEXT: psadbw %xmm0, %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm0
@@ -236,8 +240,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: psadbw %xmm0, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
@@ -260,8 +266,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: pxor %xmm0, %xmm0
 ; SSE3-NEXT: psadbw %xmm0, %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm0
@@ -439,8 +447,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: movdqa %xmm1, %xmm2
 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -468,8 +478,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: pxor %xmm0, %xmm0
 ; SSE3-NEXT: movdqa %xmm1, %xmm2
 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -682,8 +694,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: pxor %xmm0, %xmm0
 ; SSE2-NEXT: movdqa %xmm1, %xmm2
 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -711,8 +725,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: pxor %xmm0, %xmm0
 ; SSE3-NEXT: movdqa %xmm1, %xmm2
 ; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
@@ -925,8 +941,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
 ; SSE2-NEXT: psllw $8, %xmm0
 ; SSE2-NEXT: paddb %xmm1, %xmm0
@@ -950,8 +968,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm0
 ; SSE3-NEXT: psllw $8, %xmm0
 ; SSE3-NEXT: paddb %xmm1, %xmm0
@@ -1101,8 +1121,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
 ; SSE2-NEXT: psllw $8, %xmm0
 ; SSE2-NEXT: paddb %xmm1, %xmm0
@@ -1126,8 +1148,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm0
 ; SSE3-NEXT: psllw $8, %xmm0
 ; SSE3-NEXT: paddb %xmm1, %xmm0
@@ -1277,8 +1301,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
@@ -1299,8 +1325,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm0
 ; SSE3-NEXT: retq
 ;
@@ -1434,8 +1462,10 @@
 ; SSE2-NEXT: paddb %xmm2, %xmm0
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psrlw $4, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: movdqa %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
@@ -1456,8 +1486,10 @@
 ; SSE3-NEXT: paddb %xmm2, %xmm0
 ; SSE3-NEXT: movdqa %xmm0, %xmm1
 ; SSE3-NEXT: psrlw $4, %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: paddb %xmm0, %xmm1
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: movdqa %xmm1, %xmm0
 ; SSE3-NEXT: retq
 ;