Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -184,6 +184,9 @@ /// Shuffle 16 8-bit values within a vector. PSHUFB, + /// Compute Sum of Absolute Differences. + PSADBW, + /// Bitwise Logical AND NOT of Packed FP values. ANDNP, Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -833,12 +833,15 @@ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - // Only provide customized ctpop vector bit twiddling for vector types we - // know to perform better than using the popcnt instructions on each vector - // element. If popcnt isn't supported, always provide the custom version. - if (!Subtarget->hasPOPCNT()) { - setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); - setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); + if (Subtarget->hasSSSE3()) { + setOperationAction(ISD::CTPOP, MVT::v16i8, Custom); + setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); + // It is faster to extract 32/64 bit elements and use scalar ctpop + // instructions on v4i32/v2i64 elements than to custom lower ctpop. + if (!Subtarget->hasPOPCNT()) { + setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); + } } // Custom lower build_vector, vector_shuffle, and extract_vector_elt. @@ -1100,6 +1103,15 @@ setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v32i8, Custom); + setOperationAction(ISD::CTPOP, MVT::v16i16, Custom); + setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); + // It is faster to extract 64 bit elements and use scalar ctpop + // instructions on v4i64 elements for avx only (not avx2). 
But + // always profitable if scalar popcnt is not available. + if (!Subtarget->hasPOPCNT()) + setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); + if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); @@ -1130,20 +1142,13 @@ setOperationAction(ISD::MULHU, MVT::v16i16, Legal); setOperationAction(ISD::MULHS, MVT::v16i16, Legal); + // Always custom lower if avx2 is available. + setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); + // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); - // Only provide customized ctpop vector bit twiddling for vector types we - // know to perform better than using the popcnt instructions on each - // vector element. If popcnt isn't supported, always provide the custom - // version. - if (!Subtarget->hasPOPCNT()) - setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); - - // Custom CTPOP always performs better on natively supported v8i32 - setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); - // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); @@ -17104,141 +17109,210 @@ return SDValue(); } -static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - SDNode *Node = Op.getNode(); - SDLoc dl(Node); - - Op = Op.getOperand(0); +static SDValue LowerCTPOPInRegLUT(SDValue Op, SDLoc DL, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { EVT VT = Op.getValueType(); - assert((VT.is128BitVector() || VT.is256BitVector()) && - "CTPOP lowering only implemented for 128/256-bit wide vector types"); - - unsigned NumElts = VT.getVectorNumElements(); - EVT EltVT = VT.getVectorElementType(); - unsigned Len = EltVT.getSizeInBits(); + MVT EltVT = 
VT.getVectorElementType().getSimpleVT(); + unsigned VecSize = VT.getSizeInBits(); - // This is the vectorized version of the "best" algorithm from - // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel - // with a minor tweak to use a series of adds + shifts instead of vector - // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types: + // Implement a lookup table in register by using an algorithm based on: + // http://wm.ite.pl/articles/sse-popcount.html // - // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled - // v8i32 => Always profitable + // The general idea is that every lower byte nibble in the input vector is an + // index into a in-register pre-computed pop count table. We then split up the + // input vector in two new ones: (1) a vector with only the shifted-right + // higher nibbles for each byte and (2) a vector with the lower nibbles (and + // masked out higher ones) for each byte. PSHUFB is used separately with both + // to index the in-register table. Next, both are added and the result is an + // i8 vector where each element contains the pop count for input byte. // - // FIXME: There a couple of possible improvements: + // To obtain the pop count for elements != i8, we follow up with the same + // approach and use additional tricks as described below. // - // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled). 
- // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html + const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, + /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, + /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, + /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4}; + + unsigned NumByteElts = VecSize / 8; + MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts); + SDValue In = DAG.getNode(ISD::BITCAST, DL, ByteVecVT, Op); + SmallVector LUTVec; + for (unsigned i = 0; i < NumByteElts; ++i) + LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); + SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec); + SmallVector Mask0F(NumByteElts, + DAG.getConstant(0x0F, DL, MVT::i8)); + SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F); + + // High nibbles + SmallVector Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8)); + SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four); + SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV); + HighNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, HighNibbles, M0F); + + // Low nibbles + SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F); + + // The input vector is used as the shuffle mask that index elements into the + // LUT. After counting low and high nibbles, add the vector to obtain the + // final pop count per i8 element. + SDValue HighPopCnt = + DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles); + SDValue LowPopCnt = + DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles); + SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt); + + if (EltVT == MVT::i8) + return PopCnt; + + // PSADBW instruction horizontally add all bytes and leave the result in i64 + // chunks, thus directly computes the pop count for v2i64 and v4i64. 
+ if (EltVT == MVT::i64) { + SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); + PopCnt = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, PopCnt, Zeros); + return DAG.getNode(ISD::BITCAST, DL, VT, PopCnt); + } + + // Mask and shift to extract 32-bit components, use two PSADBW to pop count + // each one and OR the result. + if (EltVT == MVT::i32) { + unsigned Vec64NumByteElts = VecSize / 64; + MVT Vec64 = MVT::getVectorVT(MVT::i64, Vec64NumByteElts); + PopCnt = DAG.getNode(ISD::BITCAST, DL, Vec64, PopCnt); + + SmallVector MaskLow( + Vec64NumByteElts, + DAG.getConstant(APInt::getLowBitsSet(64, 32), DL, MVT::i64)); + SmallVector Dword(Vec64NumByteElts, + DAG.getConstant(32, DL, MVT::i64)); + + SDValue Low = DAG.getNode(ISD::BUILD_VECTOR, DL, Vec64, MaskLow); + SDValue High = + DAG.getNode(ISD::SRL, DL, Vec64, PopCnt, + DAG.getNode(ISD::BUILD_VECTOR, DL, Vec64, Dword)); + Low = DAG.getNode(ISD::AND, DL, Vec64, PopCnt, Low); + + SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); + High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + DAG.getNode(ISD::BITCAST, DL, ByteVecVT, High), Zeros); + Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + DAG.getNode(ISD::BITCAST, DL, ByteVecVT, Low), Zeros); + + High = DAG.getNode(ISD::SHL, DL, Vec64, + DAG.getNode(ISD::BITCAST, DL, Vec64, High), + DAG.getNode(ISD::BUILD_VECTOR, DL, Vec64, Dword)); + + PopCnt = DAG.getNode(ISD::OR, DL, Vec64, High, + DAG.getNode(ISD::BITCAST, DL, Vec64, Low)); + return DAG.getNode(ISD::BITCAST, DL, VT, PopCnt); + } + + // To obtain pop count for each i16 element, shuffle the byte pop count to get + // even and odd elements into distinct vectors, add them and zero-extend each + // i8 element into i16, i.e.: // - assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 && - "CTPOP not implemented for this vector element type."); - - // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid - // extra legalization. 
- bool NeedsBitcast = EltVT == MVT::i32; - MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64; - - SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, - EltVT); - SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl, - EltVT); - SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl, - EltVT); - - // v = v - ((v >> 1) & 0x55555555...) - SmallVector Ones(NumElts, DAG.getConstant(1, dl, EltVT)); - SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones); - SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV); - if (NeedsBitcast) - Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); - - SmallVector Mask55(NumElts, Cst55); - SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55); - if (NeedsBitcast) - M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55); - - SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55); - if (VT != And.getValueType()) - And = DAG.getNode(ISD::BITCAST, dl, VT, And); - SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And); - - // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) - SmallVector Mask33(NumElts, Cst33); - SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33); - SmallVector Twos(NumElts, DAG.getConstant(2, dl, EltVT)); - SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos); - - Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV); - if (NeedsBitcast) { - Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); - M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33); - Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub); - } - - SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33); - SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33); - if (VT != AndRHS.getValueType()) { - AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS); - AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS); - } - SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS); - - // v = (v + (v >> 4)) & 0x0F0F0F0F... 
- SmallVector Fours(NumElts, DAG.getConstant(4, dl, EltVT)); - SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours); - Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV); - Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); - - SmallVector Mask0F(NumElts, Cst0F); - SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F); - if (NeedsBitcast) { - Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); - M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F); - } - And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F); - if (VT != And.getValueType()) - And = DAG.getNode(ISD::BITCAST, dl, VT, And); - - // The algorithm mentioned above uses: - // v = (v * 0x01010101...) >> (Len - 8) + // B -> pop count per i8 + // W -> pop count per i16 // - // Change it to use vector adds + vector shifts which yield faster results on - // Haswell than using vector integer multiplication. + // Y = shuffle B, undef <0, 2, ...> + // Z = shuffle B, undef <1, 3, ...> + // W = zext <... x i8> to <... x i16> (Y + Z) // - // For i32 elements: - // v = v + (v >> 8) - // v = v + (v >> 16) + // Use a byte shuffle mask that matches PSHUFB. // - // For i64 elements: - // v = v + (v >> 8) - // v = v + (v >> 16) - // v = v + (v >> 32) + assert(EltVT == MVT::i16 && "Unknown how to handle type"); + SDValue Undef = DAG.getUNDEF(ByteVecVT); + SmallVector MaskA, MaskB; + + if (NumByteElts <= 16) { + for (unsigned i = 0; i < NumByteElts / 2; ++i) { + MaskA.push_back(i * 2); + MaskB.push_back((i * 2) + 1); + } + for (unsigned i = NumByteElts / 2; i < NumByteElts; ++i) { + MaskA.push_back(-1); + MaskB.push_back(-1); + } + SDValue ShuffA = + DAG.getVectorShuffle(ByteVecVT, DL, PopCnt, Undef, &MaskA[0]); + SDValue ShuffB = + DAG.getVectorShuffle(ByteVecVT, DL, PopCnt, Undef, &MaskB[0]); + PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, ShuffA, ShuffB); + + // In AVX2, PSHUFB does not support cross-lane shuffle. Therefore, shuffle + // the bytes in their own lane. 
This requires an extra shuffle to move the + // result from the second lane to the first, i.e.: // - Add = And; - SmallVector Csts; - for (unsigned i = 8; i <= Len/2; i *= 2) { - Csts.assign(NumElts, DAG.getConstant(i, dl, EltVT)); - SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts); - Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV); - Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); - Csts.clear(); - } - - // The result is on the least significant 6-bits on i32 and 7-bits on i64. - SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), dl, - EltVT); - SmallVector Cst3FV(NumElts, Cst3F); - SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV); - if (NeedsBitcast) { - Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); - M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F); - } - And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F); - if (VT != And.getValueType()) - And = DAG.getNode(ISD::BITCAST, dl, VT, And); - - return And; + // Y = shuffle B, undef <0, ... 14, -1, ... -1, 16 ...> + // Z = shuffle B, undef <1, ... 15, -1, ... -1, 17 ...> + // tmp = bitcast to v4i64 (Y + Z) + // tmp = shuffle tmp, undef <0, 2, -1, -1> + // tmp = bitcast to v32i8 tmp + // W = zext <... x i8> to <... 
x i16> tmp + // + } else { + assert(NumByteElts == 32 && "Unknown i8 vector length to handle"); + int Idx = 0; + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 8; ++j) { + if (i % 2 == 0) { + MaskA.push_back(Idx++); + MaskB.push_back(Idx++); + } else { + MaskA.push_back(-1); + MaskB.push_back(-1); + } + } + } + SDValue ShuffA = + DAG.getVectorShuffle(ByteVecVT, DL, PopCnt, Undef, &MaskA[0]); + SDValue ShuffB = + DAG.getVectorShuffle(ByteVecVT, DL, PopCnt, Undef, &MaskB[0]); + PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, ShuffA, ShuffB); + PopCnt = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, PopCnt); + SmallVector Mask({0, 2, -1, -1}); + PopCnt = DAG.getVectorShuffle(MVT::v4i64, DL, PopCnt, + DAG.getUNDEF(MVT::v4i64), &Mask[0]); + PopCnt = DAG.getNode(ISD::BITCAST, DL, ByteVecVT, PopCnt); + } + + // Zero extend i8 into i16 elts + SmallVector ZExtInRegMask; + for (unsigned i = 0, Idx = 0; i < NumByteElts; i += 2, ++Idx) { + ZExtInRegMask.push_back(Idx); + ZExtInRegMask.push_back(NumByteElts); + } + + return DAG.getNode( + ISD::BITCAST, DL, VT, + DAG.getVectorShuffle(ByteVecVT, DL, PopCnt, + getZeroVector(ByteVecVT, Subtarget, DAG, DL), + &ZExtInRegMask[0])); +} + +static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + assert((VT.is256BitVector() || VT.is128BitVector()) && + "Unknown CTPOP type to handle"); + SDLoc dl(Op.getNode()); + + if (Op.getValueType().is256BitVector() && !Subtarget->hasInt256()) { + unsigned NumElems = VT.getVectorNumElements(); + + // Extract each 128-bit vector, compute pop count and concat the result. 
+ SDValue Op0 = Op.getOperand(0); + SDValue LHS = Extract128BitVector(Op0, 0, DAG, dl); + SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, dl); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + LowerCTPOPInRegLUT(LHS, dl, Subtarget, DAG), + LowerCTPOPInRegLUT(RHS, dl, Subtarget, DAG)); + } + + return LowerCTPOPInRegLUT(Op.getOperand(0), dl, Subtarget, DAG); } static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -78,6 +78,9 @@ def X86pshufb : SDNode<"X86ISD::PSHUFB", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; +def X86psadbw : SDNode<"X86ISD::PSADBW", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; def X86andnp : SDNode<"X86ISD::ANDNP", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -4017,6 +4017,20 @@ defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw, int_x86_avx2_psad_bw, SSE_PMADD, 1>; +let Predicates = [HasAVX2] in + def : Pat<(v4i64 (bitconvert (v32i8 (X86psadbw (v32i8 VR256:$src1), + (v32i8 VR256:$src2))))), + (VPSADBWYrr VR256:$src2, VR256:$src1)>; + +let Predicates = [HasAVX] in + def : Pat<(v2i64 (bitconvert (v16i8 (X86psadbw (v16i8 VR128:$src1), + (v16i8 VR128:$src2))))), + (VPSADBWrr VR128:$src2, VR128:$src1)>; + +def : Pat<(v2i64 (bitconvert (v16i8 (X86psadbw (v16i8 VR128:$src1), + (v16i8 VR128:$src2))))), + (PSADBWrr VR128:$src2, VR128:$src1)>; + let Predicates = [HasAVX] in defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, Index: test/CodeGen/X86/avx-popcnt.ll 
=================================================================== --- /dev/null +++ test/CodeGen/X86/avx-popcnt.ll @@ -0,0 +1,382 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -mattr=+popcnt | FileCheck -check-prefix=AVX %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -mattr=-popcnt | FileCheck -check-prefix=AVX-NOPOPCNT %s + +define <4 x i32> @testv4i32(<4 x i32> %in) { +; AVX-LABEL: testv4i32: +; AVX: # BB#0: +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: popcntl %eax, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: popcntl %ecx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: popcntl %eax, %eax +; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $3, %xmm0, %eax +; AVX-NEXT: popcntl %eax, %eax +; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq +; AVX-NOPOPCNT-LABEL: testv4i32: +; AVX-NOPOPCNT: # BB#0: +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NOPOPCNT-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NOPOPCNT-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NOPOPCNT-NEXT: vpsadbw %xmm1, %xmm2, %xmm1 +; AVX-NOPOPCNT-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpsadbw %xmm0, %xmm2, %xmm0 +; AVX-NOPOPCNT-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: retq + %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in) + ret <4 x i32> %out +} + +define <32 x i8> @testv32i8(<32 x i8> %in) { +; AVX-LABEL: 
testv32i8: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; AVX-NOPOPCNT-LABEL: testv32i8: +; AVX-NOPOPCNT: # BB#0: +; AVX-NOPOPCNT-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NOPOPCNT-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NOPOPCNT-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NOPOPCNT-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vandps %xmm2, %xmm0, %xmm3 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NOPOPCNT-NEXT: retq + %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in) + ret <32 x i8> %out +} + +define <4 x 
i64> @testv4i64(<4 x i64> %in) { +; AVX-LABEL: testv4i64: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vpextrq $1, %xmm1, %rax +; AVX-NEXT: popcntq %rax, %rax +; AVX-NEXT: vmovq %rax, %xmm2 +; AVX-NEXT: vmovq %xmm1, %rax +; AVX-NEXT: popcntq %rax, %rax +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: popcntq %rax, %rax +; AVX-NEXT: vmovq %rax, %xmm2 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: popcntq %rax, %rax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; AVX-NOPOPCNT-LABEL: testv4i64: +; AVX-NOPOPCNT: # BB#0: +; AVX-NOPOPCNT-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NOPOPCNT-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NOPOPCNT-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NOPOPCNT-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NOPOPCNT-NEXT: vpsadbw %xmm1, %xmm3, %xmm1 +; AVX-NOPOPCNT-NEXT: vandps %xmm2, %xmm0, %xmm5 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpsadbw %xmm0, %xmm3, %xmm0 +; AVX-NOPOPCNT-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NOPOPCNT-NEXT: retq + %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in) + ret <4 x i64> %out +} + +define <8 x i32> 
@testv8i32(<8 x i32> %in) { +; AVX-LABEL: testv8i32: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; AVX-NEXT: vpand %xmm3, %xmm1, %xmm5 +; AVX-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX-NEXT: vpsadbw %xmm5, %xmm6, %xmm5 +; AVX-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX-NEXT: vpsadbw %xmm1, %xmm6, %xmm1 +; AVX-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm5 +; AVX-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm3, %xmm0, %xmm2 +; AVX-NEXT: vpsadbw %xmm2, %xmm6, %xmm2 +; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX-NEXT: vpsadbw %xmm0, %xmm6, %xmm0 +; AVX-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; AVX-NOPOPCNT-LABEL: testv8i32: +; AVX-NOPOPCNT: # BB#0: +; AVX-NOPOPCNT-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NOPOPCNT-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NOPOPCNT-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NOPOPCNT-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpand %xmm2, %xmm1, %xmm1 +; 
AVX-NOPOPCNT-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; AVX-NOPOPCNT-NEXT: vpand %xmm3, %xmm1, %xmm5 +; AVX-NOPOPCNT-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX-NOPOPCNT-NEXT: vpsadbw %xmm5, %xmm6, %xmm5 +; AVX-NOPOPCNT-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpsadbw %xmm1, %xmm6, %xmm1 +; AVX-NOPOPCNT-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vandps %xmm2, %xmm0, %xmm5 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm3, %xmm0, %xmm2 +; AVX-NOPOPCNT-NEXT: vpsadbw %xmm2, %xmm6, %xmm2 +; AVX-NOPOPCNT-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpsadbw %xmm0, %xmm6, %xmm0 +; AVX-NOPOPCNT-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NOPOPCNT-NEXT: retq + %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in) + ret <8 x i32> %out +} + +define <2 x i64> @testv2i64(<2 x i64> %in) { +; AVX-LABEL: testv2i64: +; AVX: # BB#0: +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: popcntq %rax, %rax +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: popcntq %rax, %rax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq +; AVX-NOPOPCNT-LABEL: testv2i64: +; AVX-NOPOPCNT: # BB#0: +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NOPOPCNT-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; 
AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NOPOPCNT-NEXT: retq + %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in) + ret <2 x i64> %out +} + +define <16 x i8> @testv16i8(<16 x i8> %in) { +; AVX-LABEL: testv16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq +; AVX-NOPOPCNT-LABEL: testv16i8: +; AVX-NOPOPCNT: # BB#0: +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NOPOPCNT-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: retq + %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in) + ret <16 x i8> %out +} + +define <16 x i16> @testv16i16(<16 x i16> %in) { +; AVX-LABEL: testv16i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 
+; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX-NEXT: vpand %xmm1, %xmm4, %xmm4 +; AVX-NEXT: vpand %xmm1, %xmm4, %xmm4 +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm5, %xmm2, %xmm2 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm5 +; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm1 +; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX-NEXT: retq +; AVX-NOPOPCNT-LABEL: testv16i16: +; AVX-NOPOPCNT: # BB#0: +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NOPOPCNT-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm4, %xmm4 +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm4, %xmm4 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX-NOPOPCNT-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX-NOPOPCNT-NEXT: vmovdqa 
{{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NOPOPCNT-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm5, %xmm2, %xmm2 +; AVX-NOPOPCNT-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX-NOPOPCNT-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm5 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm4, %xmm0, %xmm1 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NOPOPCNT-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX-NOPOPCNT-NEXT: retq + %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in) + ret <16 x i16> %out +} + +define <8 x i16> @testv8i16(<8 x i16> %in) { +; AVX-LABEL: testv8i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: retq +; AVX-NOPOPCNT-LABEL: testv8i16: +; AVX-NOPOPCNT: # BB#0: +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NOPOPCNT-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX-NOPOPCNT-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NOPOPCNT-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NOPOPCNT-NEXT: retq + %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in) + ret <8 x i16> %out +} + +declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) +declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>) +declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) +declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) +declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) +declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) +declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) +declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) Index: test/CodeGen/X86/avx2-popcnt.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx2-popcnt.ll @@ -0,0 +1,93 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -mattr=+popcnt | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -mattr=-popcnt | FileCheck %s + +; When avx2 is enabled, we should always generate the same code 
regardless +; of popcnt instruction availability. + +define <32 x i8> @testv32i8(<32 x i8> %in) { +; CHECK-LABEL: testv32i8: +; CHECK: # BB#0: +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; CHECK-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0 +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: retq + %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in) + ret <32 x i8> %out +} + +define <4 x i64> @testv4i64(<4 x i64> %in) { +; CHECK-LABEL: testv4i64: +; CHECK: # BB#0: +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; CHECK-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0 +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; CHECK-NEXT: vpsadbw %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: retq + %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in) + ret <4 x i64> %out +} + +define <8 x i32> @testv8i32(<8 x i32> %in) { +; CHECK-LABEL: testv8i32: +; CHECK: # BB#0: +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; CHECK-NEXT: vpshufb %ymm2, %ymm3, 
%ymm2 +; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0 +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm1 +; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; CHECK-NEXT: vpsadbw %ymm1, %ymm2, %ymm1 +; CHECK-NEXT: vpsrlq $32, %ymm0, %ymm0 +; CHECK-NEXT: vpsadbw %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vpsllq $32, %ymm0, %ymm0 +; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in) + ret <8 x i32> %out +} + +define <16 x i16> @testv16i16(<16 x i16> %in) { +; CHECK-LABEL: testv16i16: +; CHECK: # BB#0: +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; CHECK-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0 +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] +; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; CHECK-NEXT: retq + %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in) + ret <16 x i16> %out +} + +declare <32 x i8> 
@llvm.ctpop.v32i8(<32 x i8>) +declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) +declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) +declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) Index: test/CodeGen/X86/vector-ctpop.ll =================================================================== --- test/CodeGen/X86/vector-ctpop.ll +++ /dev/null @@ -1,159 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck -check-prefix=AVX2 %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx -mattr=-popcnt | FileCheck -check-prefix=AVX1-NOPOPCNT %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 -mattr=-popcnt | FileCheck -check-prefix=AVX2-NOPOPCNT %s - -; Vector version of: -; v = v - ((v >> 1) & 0x55555555) -; v = (v & 0x33333333) + ((v >> 2) & 0x33333333) -; v = (v + (v >> 4) & 0xF0F0F0F) -; v = v + (v >> 8) -; v = v + (v >> 16) -; v = v + (v >> 32) ; i64 only - -define <8 x i32> @test0(<8 x i32> %x) { -; AVX2-LABEL: @test0 -entry: -; AVX2: vpsrld $1, %ymm -; AVX2-NEXT: vpbroadcastd -; AVX2-NEXT: vpand -; AVX2-NEXT: vpsubd -; AVX2-NEXT: vpbroadcastd -; AVX2-NEXT: vpand -; AVX2-NEXT: vpsrld $2 -; AVX2-NEXT: vpand -; AVX2-NEXT: vpaddd -; AVX2-NEXT: vpsrld $4 -; AVX2-NEXT: vpaddd -; AVX2-NEXT: vpbroadcastd -; AVX2-NEXT: vpand -; AVX2-NEXT: vpsrld $8 -; AVX2-NEXT: vpaddd -; AVX2-NEXT: vpsrld $16 -; AVX2-NEXT: vpaddd -; AVX2-NEXT: vpbroadcastd -; AVX2-NEXT: vpand - %y = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %x) - ret <8 x i32> %y -} - -define <4 x i64> @test1(<4 x i64> %x) { -; AVX2-NOPOPCNT-LABEL: @test1 -entry: -; AVX2-NOPOPCNT: vpsrlq $1, %ymm -; AVX2-NOPOPCNT-NEXT: vpbroadcastq -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsubq -; AVX2-NOPOPCNT-NEXT: vpbroadcastq -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsrlq $2 -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpsrlq $4 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpbroadcastq -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: 
vpsrlq $8 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpsrlq $16 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpsrlq $32 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpbroadcastq -; AVX2-NOPOPCNT-NEXT: vpand - %y = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %x) - ret <4 x i64> %y -} - -define <4 x i32> @test2(<4 x i32> %x) { -; AVX2-NOPOPCNT-LABEL: @test2 -; AVX1-NOPOPCNT-LABEL: @test2 -entry: -; AVX2-NOPOPCNT: vpsrld $1, %xmm -; AVX2-NOPOPCNT-NEXT: vpbroadcastd -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsubd -; AVX2-NOPOPCNT-NEXT: vpbroadcastd -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsrld $2 -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpaddd -; AVX2-NOPOPCNT-NEXT: vpsrld $4 -; AVX2-NOPOPCNT-NEXT: vpaddd -; AVX2-NOPOPCNT-NEXT: vpbroadcastd -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsrld $8 -; AVX2-NOPOPCNT-NEXT: vpaddd -; AVX2-NOPOPCNT-NEXT: vpsrld $16 -; AVX2-NOPOPCNT-NEXT: vpaddd -; AVX2-NOPOPCNT-NEXT: vpbroadcastd -; AVX2-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT: vpsrld $1, %xmm -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpsubd -; AVX1-NOPOPCNT-NEXT: vmovdqa -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpsrld $2 -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpaddd -; AVX1-NOPOPCNT-NEXT: vpsrld $4 -; AVX1-NOPOPCNT-NEXT: vpaddd -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpsrld $8 -; AVX1-NOPOPCNT-NEXT: vpaddd -; AVX1-NOPOPCNT-NEXT: vpsrld $16 -; AVX1-NOPOPCNT-NEXT: vpaddd -; AVX1-NOPOPCNT-NEXT: vpand - %y = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x) - ret <4 x i32> %y -} - -define <2 x i64> @test3(<2 x i64> %x) { -; AVX2-NOPOPCNT-LABEL: @test3 -; AVX1-NOPOPCNT-LABEL: @test3 -entry: -; AVX2-NOPOPCNT: vpsrlq $1, %xmm -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsubq -; AVX2-NOPOPCNT-NEXT: vmovdqa -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsrlq $2 -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpsrlq $4 -; 
AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsrlq $8 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpsrlq $16 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpsrlq $32 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT: vpsrlq $1, %xmm -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpsubq -; AVX1-NOPOPCNT-NEXT: vmovdqa -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpsrlq $2 -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpaddq -; AVX1-NOPOPCNT-NEXT: vpsrlq $4 -; AVX1-NOPOPCNT-NEXT: vpaddq -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpsrlq $8 -; AVX1-NOPOPCNT-NEXT: vpaddq -; AVX1-NOPOPCNT-NEXT: vpsrlq $16 -; AVX1-NOPOPCNT-NEXT: vpaddq -; AVX1-NOPOPCNT-NEXT: vpsrlq $32 -; AVX1-NOPOPCNT-NEXT: vpaddq -; AVX1-NOPOPCNT-NEXT: vpand - %y = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x) - ret <2 x i64> %y -} - -declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) -declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) - -declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) -declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) -