Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -184,6 +184,9 @@ /// Shuffle 16 8-bit values within a vector. PSHUFB, + /// Compute Sum of Absolute Differences. + PSADBW, + /// Bitwise Logical AND NOT of Packed FP values. ANDNP, Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -842,12 +842,15 @@ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - // Only provide customized ctpop vector bit twiddling for vector types we - // know to perform better than using the popcnt instructions on each vector - // element. If popcnt isn't supported, always provide the custom version. - if (!Subtarget->hasPOPCNT()) { - setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); - setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); + if (Subtarget->hasSSSE3()) { + setOperationAction(ISD::CTPOP, MVT::v16i8, Custom); + setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); + // It is faster to extract 32/64 bit elements and use scalar ctpop + // instructions on v4i32/v4i64 elements than to custom lower ctpop. + if (!Subtarget->hasPOPCNT()) { + setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); + } } // Custom lower build_vector, vector_shuffle, and extract_vector_elt. @@ -1113,6 +1116,15 @@ setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v32i8, Custom); + setOperationAction(ISD::CTPOP, MVT::v16i16, Custom); + setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); + // It is faster to extract 64 bit elements and use scalar ctpop + // instructions on v4i64 elements for avx only (not avx2). 
But + // always profitable if scalar popcnt is not available. + if (!Subtarget->hasPOPCNT()) + setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); + if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); @@ -1143,20 +1155,13 @@ setOperationAction(ISD::MULHU, MVT::v16i16, Legal); setOperationAction(ISD::MULHS, MVT::v16i16, Legal); + // Always custom lower if avx2 is available. + setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); + // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); - // Only provide customized ctpop vector bit twiddling for vector types we - // know to perform better than using the popcnt instructions on each - // vector element. If popcnt isn't supported, always provide the custom - // version. - if (!Subtarget->hasPOPCNT()) - setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); - - // Custom CTPOP always performs better on natively supported v8i32 - setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); - // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); @@ -17315,141 +17320,181 @@ return SDValue(); } -static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - SDNode *Node = Op.getNode(); - SDLoc dl(Node); - - Op = Op.getOperand(0); +static SDValue LowerCTPOPInRegLUT(SDValue Op, SDLoc DL, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { EVT VT = Op.getValueType(); - assert((VT.is128BitVector() || VT.is256BitVector()) && - "CTPOP lowering only implemented for 128/256-bit wide vector types"); + MVT EltVT = VT.getVectorElementType().getSimpleVT(); + unsigned VecSize = VT.getSizeInBits(); - unsigned NumElts = VT.getVectorNumElements(); - EVT EltVT = 
VT.getVectorElementType(); - unsigned Len = EltVT.getSizeInBits(); - - // This is the vectorized version of the "best" algorithm from - // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel - // with a minor tweak to use a series of adds + shifts instead of vector - // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types: - // - // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled - // v8i32 => Always profitable + // Implement a lookup table in register by using an algorithm based on: + // http://wm.ite.pl/articles/sse-popcount.html // - // FIXME: There a couple of possible improvements: + // The general idea is that every lower byte nibble in the input vector is an + // index into an in-register pre-computed pop count table. We then split up the + // input vector into two new ones: (1) a vector with only the shifted-right + // higher nibbles for each byte and (2) a vector with the lower nibbles (and + // masked out higher ones) for each byte. PSHUFB is used separately with both + // to index the in-register table. Next, both are added and the result is an + // i8 vector where each element contains the pop count for its input byte. // - // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled). - // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html + // To obtain the pop count for elements != i8, we follow up with the same + // approach and use additional tricks as described below. // - assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 && - "CTPOP not implemented for this vector element type."); - - // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid - // extra legalization. - bool NeedsBitcast = EltVT == MVT::i32; - MVT BitcastVT = VT.is256BitVector() ?
MVT::v4i64 : MVT::v2i64; - - SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, - EltVT); - SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl, - EltVT); - SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl, - EltVT); - - // v = v - ((v >> 1) & 0x55555555...) - SmallVector Ones(NumElts, DAG.getConstant(1, dl, EltVT)); - SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones); - SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV); - if (NeedsBitcast) - Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); - - SmallVector Mask55(NumElts, Cst55); - SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55); - if (NeedsBitcast) - M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55); - - SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55); - if (VT != And.getValueType()) - And = DAG.getNode(ISD::BITCAST, dl, VT, And); - SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And); - - // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) - SmallVector Mask33(NumElts, Cst33); - SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33); - SmallVector Twos(NumElts, DAG.getConstant(2, dl, EltVT)); - SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos); - - Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV); - if (NeedsBitcast) { - Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); - M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33); - Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub); - } - - SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33); - SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33); - if (VT != AndRHS.getValueType()) { - AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS); - AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS); - } - SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS); - - // v = (v + (v >> 4)) & 0x0F0F0F0F... 
- SmallVector Fours(NumElts, DAG.getConstant(4, dl, EltVT)); - SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours); - Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV); - Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); - - SmallVector Mask0F(NumElts, Cst0F); - SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F); - if (NeedsBitcast) { - Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); - M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F); - } - And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F); - if (VT != And.getValueType()) - And = DAG.getNode(ISD::BITCAST, dl, VT, And); - - // The algorithm mentioned above uses: - // v = (v * 0x01010101...) >> (Len - 8) + const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, + /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, + /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, + /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4}; + + int NumByteElts = VecSize / 8; + MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts); + SDValue In = DAG.getNode(ISD::BITCAST, DL, ByteVecVT, Op); + SmallVector LUTVec; + for (int i = 0; i < NumByteElts; ++i) + LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); + SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, LUTVec); + SmallVector Mask0F(NumByteElts, + DAG.getConstant(0x0F, DL, MVT::i8)); + SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Mask0F); + + // High nibbles + SmallVector Four(NumByteElts, DAG.getConstant(4, DL, MVT::i8)); + SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, DL, ByteVecVT, Four); + SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV); + + // Low nibbles + SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F); + + // The input vector is used as the shuffle mask that index elements into the + // LUT. After counting low and high nibbles, add the vector to obtain the + // final pop count per i8 element. 
+ SDValue HighPopCnt = + DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles); + SDValue LowPopCnt = + DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles); + SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt); + + if (EltVT == MVT::i8) + return PopCnt; + + // PSADBW instruction horizontally add all bytes and leave the result in i64 + // chunks, thus directly computes the pop count for v2i64 and v4i64. + if (EltVT == MVT::i64) { + SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); + PopCnt = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, PopCnt, Zeros); + return DAG.getNode(ISD::BITCAST, DL, VT, PopCnt); + } + + int NumI64Elts = VecSize / 64; + MVT VecI64VT = MVT::getVectorVT(MVT::i64, NumI64Elts); + + // Mask and shift to extract 32-bit components, use two PSADBW to pop count + // each one and OR the result. + if (EltVT == MVT::i32) { + // We unpack the low half and high half into i32s interleaved with zeros so + // that we can use PSADBW to horizontally sum them. The most useful part of + // this is that it lines up the results of two PSADBW instructions to be + // two v2i64 vectors which concatenated are the 4 population counts. We can + // then use PACKUSWB to shrink and concatenate them into a v4i32 again. + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL); + SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, PopCnt, Zeros); + SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, PopCnt, Zeros); + + // Do the horizontal sums into two v2i64s. + Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); + Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + DAG.getNode(ISD::BITCAST, DL, ByteVecVT, Low), Zeros); + High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + DAG.getNode(ISD::BITCAST, DL, ByteVecVT, High), Zeros); + + // Merge them together. 
+ MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16); + PopCnt = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT, + DAG.getNode(ISD::BITCAST, DL, ShortVecVT, Low), + DAG.getNode(ISD::BITCAST, DL, ShortVecVT, High)); + + return DAG.getNode(ISD::BITCAST, DL, VT, PopCnt); + } + + // To obtain pop count for each i16 element, shuffle the byte pop count to get + // even and odd elements into distinct vectors, add them and zero-extend each + // i8 element into i16, i.e.: // - // Change it to use vector adds + vector shifts which yield faster results on - // Haswell than using vector integer multiplication. + // B -> pop count per i8 + // W -> pop count per i16 // - // For i32 elements: - // v = v + (v >> 8) - // v = v + (v >> 16) + // Y = shuffle B, undef <0, 2, ...> + // Z = shuffle B, undef <1, 3, ...> + // W = zext <... x i8> to <... x i16> (Y + Z) // - // For i64 elements: - // v = v + (v >> 8) - // v = v + (v >> 16) - // v = v + (v >> 32) + // Use a byte shuffle mask that matches PSHUFB. // - Add = And; - SmallVector Csts; - for (unsigned i = 8; i <= Len/2; i *= 2) { - Csts.assign(NumElts, DAG.getConstant(i, dl, EltVT)); - SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts); - Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV); - Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); - Csts.clear(); + assert(EltVT == MVT::i16 && "Unknown how to handle type"); + SDValue Undef = DAG.getUNDEF(ByteVecVT); + SmallVector MaskA, MaskB; + + // We can't use PSHUFB across lanes, so do the shuffle and sum inside each + // 128-bit lane, and then collapse the result.
+ int NumLanes = NumByteElts / 16; + assert(NumByteElts % 16 == 0 && "Must have 16-byte multiple vectors!"); + for (int i = 0; i < NumLanes; ++i) { + for (int j = 0; j < 8; ++j) { + MaskA.push_back(i * 16 + j * 2); + MaskB.push_back(i * 16 + (j * 2) + 1); + } + MaskA.append((size_t)8, -1); + MaskB.append((size_t)8, -1); + } + + SDValue ShuffA = DAG.getVectorShuffle(ByteVecVT, DL, PopCnt, Undef, MaskA); + SDValue ShuffB = DAG.getVectorShuffle(ByteVecVT, DL, PopCnt, Undef, MaskB); + PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, ShuffA, ShuffB); + + SmallVector Mask; + for (int i = 0; i < NumLanes; ++i) + Mask.push_back(2 * i); + Mask.append((size_t)NumLanes, -1); + + PopCnt = DAG.getNode(ISD::BITCAST, DL, VecI64VT, PopCnt); + PopCnt = + DAG.getVectorShuffle(VecI64VT, DL, PopCnt, DAG.getUNDEF(VecI64VT), Mask); + PopCnt = DAG.getNode(ISD::BITCAST, DL, ByteVecVT, PopCnt); + + // Zero extend i8s into i16 elts + SmallVector ZExtInRegMask; + for (int i = 0; i < NumByteElts / 2; ++i) { + ZExtInRegMask.push_back(i); + ZExtInRegMask.push_back(NumByteElts); } - // The result is on the least significant 6-bits on i32 and 7-bits on i64. - SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 
0x3F : 0x7F), dl, - EltVT); - SmallVector Cst3FV(NumElts, Cst3F); - SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV); - if (NeedsBitcast) { - Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); - M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F); + return DAG.getNode( + ISD::BITCAST, DL, VT, + DAG.getVectorShuffle(ByteVecVT, DL, PopCnt, + getZeroVector(ByteVecVT, Subtarget, DAG, DL), + ZExtInRegMask)); +} + +static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + assert((VT.is256BitVector() || VT.is128BitVector()) && + "Unknown CTPOP type to handle"); + SDLoc dl(Op.getNode()); + + if (Op.getValueType().is256BitVector() && !Subtarget->hasInt256()) { + unsigned NumElems = VT.getVectorNumElements(); + + // Extract each 128-bit vector, compute pop count and concat the result. + SDValue Op0 = Op.getOperand(0); + SDValue LHS = Extract128BitVector(Op0, 0, DAG, dl); + SDValue RHS = Extract128BitVector(Op0, NumElems/2, DAG, dl); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + LowerCTPOPInRegLUT(LHS, dl, Subtarget, DAG), + LowerCTPOPInRegLUT(RHS, dl, Subtarget, DAG)); } - And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F); - if (VT != And.getValueType()) - And = DAG.getNode(ISD::BITCAST, dl, VT, And); - return And; + return LowerCTPOPInRegLUT(Op.getOperand(0), dl, Subtarget, DAG); } static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { @@ -18145,6 +18190,7 @@ case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; + case X86ISD::PSADBW: return "X86ISD::PSADBW"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; Index: lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- 
lib/Target/X86/X86InstrFragmentsSIMD.td +++ lib/Target/X86/X86InstrFragmentsSIMD.td @@ -78,6 +78,9 @@ def X86pshufb : SDNode<"X86ISD::PSHUFB", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; +def X86psadbw : SDNode<"X86ISD::PSADBW", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; def X86andnp : SDNode<"X86ISD::ANDNP", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -4053,6 +4053,20 @@ defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw, int_x86_avx2_psad_bw, SSE_PMADD, 1>; +let Predicates = [HasAVX2] in + def : Pat<(v32i8 (X86psadbw (v32i8 VR256:$src1), + (v32i8 VR256:$src2))), + (VPSADBWYrr VR256:$src2, VR256:$src1)>; + +let Predicates = [HasAVX] in + def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1), + (v16i8 VR128:$src2))), + (VPSADBWrr VR128:$src2, VR128:$src1)>; + +def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1), + (v16i8 VR128:$src2))), + (PSADBWrr VR128:$src2, VR128:$src1)>; + let Predicates = [HasAVX] in defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, Index: test/CodeGen/X86/avx-popcnt.ll =================================================================== --- test/CodeGen/X86/avx-popcnt.ll +++ test/CodeGen/X86/avx-popcnt.ll @@ -20,647 +20,45 @@ ; ; AVX-NOPOPCNT-LABEL: testv4i32: ; AVX-NOPOPCNT: # BB#0: -; AVX-NOPOPCNT-NEXT: vpsrld $1, %xmm0, %xmm1 -; AVX-NOPOPCNT-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459] +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NOPOPCNT-NEXT: vpsrld 
$2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NOPOPCNT-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NOPOPCNT-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX-NOPOPCNT-NEXT: vpsrld $4, %xmm0, %xmm1 -; AVX-NOPOPCNT-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NOPOPCNT-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NOPOPCNT-NEXT: vpsrld $8, %xmm0, %xmm1 -; AVX-NOPOPCNT-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NOPOPCNT-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NOPOPCNT-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NOPOPCNT-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NOPOPCNT-NEXT: vpsadbw %xmm2, %xmm1, %xmm2 +; AVX-NOPOPCNT-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NOPOPCNT-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NOPOPCNT-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX-NOPOPCNT-NEXT: retq %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in) ret <4 x i32> %out } define <32 x i8> @testv32i8(<32 x i8> %in) { -; AVX-POPCNT-LABEL: testv32i8: -; AVX-POPCNT: # BB#0: -; AVX-POPCNT-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $1, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpextrb $0, %xmm1, %ecx -; AVX-POPCNT-NEXT: popcntw %cx, %cx -; AVX-POPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX-POPCNT-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $2, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $3, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $4, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; 
AVX-POPCNT-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $5, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $6, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $7, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $8, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $9, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $10, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $11, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $12, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $13, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $14, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $15, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $1, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpextrb $0, %xmm0, %ecx -; AVX-POPCNT-NEXT: popcntw %cx, %cx -; AVX-POPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX-POPCNT-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $2, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $3, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: 
vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $4, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $5, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $6, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $7, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $8, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $9, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $10, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $11, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $12, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $13, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $14, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrb $15, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 -; AVX-POPCNT-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-POPCNT-NEXT: retq -; -; AVX-NOPOPCNT-LABEL: testv32i8: -; AVX-NOPOPCNT: # BB#0: -; AVX-NOPOPCNT-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $1, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb 
%cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpextrb $0, %xmm1, %ecx -; AVX-NOPOPCNT-NEXT: movb %cl, %dl -; AVX-NOPOPCNT-NEXT: shrb %dl -; AVX-NOPOPCNT-NEXT: andb $85, %dl -; AVX-NOPOPCNT-NEXT: subb %dl, %cl -; AVX-NOPOPCNT-NEXT: movb %cl, %dl -; AVX-NOPOPCNT-NEXT: andb $51, %dl -; AVX-NOPOPCNT-NEXT: shrb $2, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: addb %dl, %cl -; AVX-NOPOPCNT-NEXT: movb %cl, %dl -; AVX-NOPOPCNT-NEXT: shrb $4, %dl -; AVX-NOPOPCNT-NEXT: addb %cl, %dl -; AVX-NOPOPCNT-NEXT: andb $15, %dl -; AVX-NOPOPCNT-NEXT: movzbl %dl, %ecx -; AVX-NOPOPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX-NOPOPCNT-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $2, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $3, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb 
%al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $4, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $5, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $6, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $6, %eax, %xmm2, 
%xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $7, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $8, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $9, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $10, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; 
AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $11, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $12, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $13, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; 
AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $14, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $15, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $1, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; 
AVX-NOPOPCNT-NEXT: vpextrb $0, %xmm0, %ecx -; AVX-NOPOPCNT-NEXT: movb %cl, %dl -; AVX-NOPOPCNT-NEXT: shrb %dl -; AVX-NOPOPCNT-NEXT: andb $85, %dl -; AVX-NOPOPCNT-NEXT: subb %dl, %cl -; AVX-NOPOPCNT-NEXT: movb %cl, %dl -; AVX-NOPOPCNT-NEXT: andb $51, %dl -; AVX-NOPOPCNT-NEXT: shrb $2, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: addb %dl, %cl -; AVX-NOPOPCNT-NEXT: movb %cl, %dl -; AVX-NOPOPCNT-NEXT: shrb $4, %dl -; AVX-NOPOPCNT-NEXT: addb %cl, %dl -; AVX-NOPOPCNT-NEXT: andb $15, %dl -; AVX-NOPOPCNT-NEXT: movzbl %dl, %ecx -; AVX-NOPOPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX-NOPOPCNT-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $2, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $3, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $4, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; 
AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $5, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $6, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $7, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb 
%cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $8, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $9, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $10, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; 
AVX-NOPOPCNT-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $11, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $12, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $13, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $14, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, 
%cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrb $15, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 -; AVX-NOPOPCNT-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NOPOPCNT-NEXT: retq +; AVX-LABEL: testv32i8: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in) ret <32 x i8> %out 
} @@ -689,80 +87,23 @@ ; AVX-NOPOPCNT-LABEL: testv4i64: ; AVX-NOPOPCNT: # BB#0: ; AVX-NOPOPCNT-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrq $1, %xmm1, %rdx -; AVX-NOPOPCNT-NEXT: movq %rdx, %rax -; AVX-NOPOPCNT-NEXT: shrq %rax -; AVX-NOPOPCNT-NEXT: movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555 -; AVX-NOPOPCNT-NEXT: andq %r8, %rax -; AVX-NOPOPCNT-NEXT: subq %rax, %rdx -; AVX-NOPOPCNT-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 -; AVX-NOPOPCNT-NEXT: movq %rdx, %rsi -; AVX-NOPOPCNT-NEXT: andq %rax, %rsi -; AVX-NOPOPCNT-NEXT: shrq $2, %rdx -; AVX-NOPOPCNT-NEXT: andq %rax, %rdx -; AVX-NOPOPCNT-NEXT: addq %rsi, %rdx -; AVX-NOPOPCNT-NEXT: movq %rdx, %rdi -; AVX-NOPOPCNT-NEXT: shrq $4, %rdi -; AVX-NOPOPCNT-NEXT: addq %rdx, %rdi -; AVX-NOPOPCNT-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F -; AVX-NOPOPCNT-NEXT: andq %rdx, %rdi -; AVX-NOPOPCNT-NEXT: movabsq $72340172838076673, %rsi # imm = 0x101010101010101 -; AVX-NOPOPCNT-NEXT: imulq %rsi, %rdi -; AVX-NOPOPCNT-NEXT: shrq $56, %rdi -; AVX-NOPOPCNT-NEXT: vmovq %rdi, %xmm2 -; AVX-NOPOPCNT-NEXT: vmovq %xmm1, %rcx -; AVX-NOPOPCNT-NEXT: movq %rcx, %rdi -; AVX-NOPOPCNT-NEXT: shrq %rdi -; AVX-NOPOPCNT-NEXT: andq %r8, %rdi -; AVX-NOPOPCNT-NEXT: subq %rdi, %rcx -; AVX-NOPOPCNT-NEXT: movq %rcx, %rdi -; AVX-NOPOPCNT-NEXT: andq %rax, %rdi -; AVX-NOPOPCNT-NEXT: shrq $2, %rcx -; AVX-NOPOPCNT-NEXT: andq %rax, %rcx -; AVX-NOPOPCNT-NEXT: addq %rdi, %rcx -; AVX-NOPOPCNT-NEXT: movq %rcx, %rdi -; AVX-NOPOPCNT-NEXT: shrq $4, %rdi -; AVX-NOPOPCNT-NEXT: addq %rcx, %rdi -; AVX-NOPOPCNT-NEXT: andq %rdx, %rdi -; AVX-NOPOPCNT-NEXT: imulq %rsi, %rdi -; AVX-NOPOPCNT-NEXT: shrq $56, %rdi -; AVX-NOPOPCNT-NEXT: vmovq %rdi, %xmm1 -; AVX-NOPOPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NOPOPCNT-NEXT: vpextrq $1, %xmm0, %rcx -; AVX-NOPOPCNT-NEXT: movq %rcx, %rdi -; AVX-NOPOPCNT-NEXT: shrq %rdi -; AVX-NOPOPCNT-NEXT: andq %r8, %rdi -; AVX-NOPOPCNT-NEXT: subq %rdi, 
%rcx -; AVX-NOPOPCNT-NEXT: movq %rcx, %rdi -; AVX-NOPOPCNT-NEXT: andq %rax, %rdi -; AVX-NOPOPCNT-NEXT: shrq $2, %rcx -; AVX-NOPOPCNT-NEXT: andq %rax, %rcx -; AVX-NOPOPCNT-NEXT: addq %rdi, %rcx -; AVX-NOPOPCNT-NEXT: movq %rcx, %rdi -; AVX-NOPOPCNT-NEXT: shrq $4, %rdi -; AVX-NOPOPCNT-NEXT: addq %rcx, %rdi -; AVX-NOPOPCNT-NEXT: andq %rdx, %rdi -; AVX-NOPOPCNT-NEXT: imulq %rsi, %rdi -; AVX-NOPOPCNT-NEXT: shrq $56, %rdi -; AVX-NOPOPCNT-NEXT: vmovq %rdi, %xmm2 -; AVX-NOPOPCNT-NEXT: vmovq %xmm0, %rcx -; AVX-NOPOPCNT-NEXT: movq %rcx, %rdi -; AVX-NOPOPCNT-NEXT: shrq %rdi -; AVX-NOPOPCNT-NEXT: andq %r8, %rdi -; AVX-NOPOPCNT-NEXT: subq %rdi, %rcx -; AVX-NOPOPCNT-NEXT: movq %rcx, %rdi -; AVX-NOPOPCNT-NEXT: andq %rax, %rdi -; AVX-NOPOPCNT-NEXT: shrq $2, %rcx -; AVX-NOPOPCNT-NEXT: andq %rax, %rcx -; AVX-NOPOPCNT-NEXT: addq %rdi, %rcx -; AVX-NOPOPCNT-NEXT: movq %rcx, %rax -; AVX-NOPOPCNT-NEXT: shrq $4, %rax -; AVX-NOPOPCNT-NEXT: addq %rcx, %rax -; AVX-NOPOPCNT-NEXT: andq %rdx, %rax -; AVX-NOPOPCNT-NEXT: imulq %rsi, %rax -; AVX-NOPOPCNT-NEXT: shrq $56, %rax -; AVX-NOPOPCNT-NEXT: vmovq %rax, %xmm0 -; AVX-NOPOPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX-NOPOPCNT-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NOPOPCNT-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NOPOPCNT-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NOPOPCNT-NEXT: vpsadbw %xmm1, %xmm3, %xmm1 +; AVX-NOPOPCNT-NEXT: vandps %xmm2, %xmm0, %xmm5 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX-NOPOPCNT-NEXT: 
vpaddb %xmm5, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpsadbw %xmm0, %xmm3, %xmm0 ; AVX-NOPOPCNT-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NOPOPCNT-NEXT: retq %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in) @@ -770,177 +111,36 @@ } define <8 x i32> @testv8i32(<8 x i32> %in) { -; AVX-POPCNT-LABEL: testv8i32: -; AVX-POPCNT: # BB#0: -; AVX-POPCNT-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-POPCNT-NEXT: vpextrd $1, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntl %eax, %eax -; AVX-POPCNT-NEXT: vmovd %xmm1, %ecx -; AVX-POPCNT-NEXT: popcntl %ecx, %ecx -; AVX-POPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX-POPCNT-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrd $2, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntl %eax, %eax -; AVX-POPCNT-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrd $3, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntl %eax, %eax -; AVX-POPCNT-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; AVX-POPCNT-NEXT: vpextrd $1, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntl %eax, %eax -; AVX-POPCNT-NEXT: vmovd %xmm0, %ecx -; AVX-POPCNT-NEXT: popcntl %ecx, %ecx -; AVX-POPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX-POPCNT-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrd $2, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntl %eax, %eax -; AVX-POPCNT-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrd $3, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntl %eax, %eax -; AVX-POPCNT-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 -; AVX-POPCNT-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-POPCNT-NEXT: retq -; -; AVX-NOPOPCNT-LABEL: testv8i32: -; AVX-NOPOPCNT: # BB#0: -; AVX-NOPOPCNT-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrd $1, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: 
andl $858993459, %eax # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX-NOPOPCNT-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; AVX-NOPOPCNT-NEXT: shrl $24, %eax -; AVX-NOPOPCNT-NEXT: vmovd %xmm1, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX-NOPOPCNT-NEXT: shrl %edx -; AVX-NOPOPCNT-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; AVX-NOPOPCNT-NEXT: subl %edx, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX-NOPOPCNT-NEXT: andl $858993459, %edx # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: shrl $2, %ecx -; AVX-NOPOPCNT-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: addl %edx, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX-NOPOPCNT-NEXT: shrl $4, %edx -; AVX-NOPOPCNT-NEXT: addl %ecx, %edx -; AVX-NOPOPCNT-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; AVX-NOPOPCNT-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101 -; AVX-NOPOPCNT-NEXT: shrl $24, %ecx -; AVX-NOPOPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX-NOPOPCNT-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrd $2, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $858993459, %eax # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX-NOPOPCNT-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; AVX-NOPOPCNT-NEXT: shrl $24, %eax -; AVX-NOPOPCNT-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrd 
$3, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $858993459, %eax # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX-NOPOPCNT-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; AVX-NOPOPCNT-NEXT: shrl $24, %eax -; AVX-NOPOPCNT-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrd $1, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $858993459, %eax # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX-NOPOPCNT-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; AVX-NOPOPCNT-NEXT: shrl $24, %eax -; AVX-NOPOPCNT-NEXT: vmovd %xmm0, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX-NOPOPCNT-NEXT: shrl %edx -; AVX-NOPOPCNT-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; AVX-NOPOPCNT-NEXT: subl %edx, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX-NOPOPCNT-NEXT: andl $858993459, %edx # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: shrl $2, %ecx -; AVX-NOPOPCNT-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: addl %edx, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; 
AVX-NOPOPCNT-NEXT: shrl $4, %edx -; AVX-NOPOPCNT-NEXT: addl %ecx, %edx -; AVX-NOPOPCNT-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; AVX-NOPOPCNT-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101 -; AVX-NOPOPCNT-NEXT: shrl $24, %ecx -; AVX-NOPOPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX-NOPOPCNT-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrd $2, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $858993459, %eax # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX-NOPOPCNT-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; AVX-NOPOPCNT-NEXT: shrl $24, %eax -; AVX-NOPOPCNT-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrd $3, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $1431655765, %ecx # imm = 0x55555555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $858993459, %eax # imm = 0x33333333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX-NOPOPCNT-NEXT: imull $16843009, %ecx, %eax # imm = 0x1010101 -; AVX-NOPOPCNT-NEXT: shrl $24, %eax -; AVX-NOPOPCNT-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 -; AVX-NOPOPCNT-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NOPOPCNT-NEXT: retq +; 
AVX-LABEL: testv8i32: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX-NEXT: vpsadbw %xmm5, %xmm3, %xmm5 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX-NEXT: vpsadbw %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX-NEXT: vandps %xmm2, %xmm0, %xmm5 +; AVX-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX-NEXT: vpsadbw %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX-NEXT: vpsadbw %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in) ret <8 x i32> %out } @@ -959,873 +159,87 @@ ; ; AVX-NOPOPCNT-LABEL: testv2i64: ; AVX-NOPOPCNT: # BB#0: -; AVX-NOPOPCNT-NEXT: vpsrlq $1, %xmm0, %xmm1 -; AVX-NOPOPCNT-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NOPOPCNT-NEXT: vpsrlq $2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NOPOPCNT-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NOPOPCNT-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX-NOPOPCNT-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NOPOPCNT-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; AVX-NOPOPCNT-NEXT: vpsrlq $4, %xmm0, %xmm1 -; AVX-NOPOPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NOPOPCNT-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NOPOPCNT-NEXT: vpsrlq $8, %xmm0, %xmm1 -; AVX-NOPOPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NOPOPCNT-NEXT: vpsrlq $16, %xmm0, %xmm1 -; AVX-NOPOPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NOPOPCNT-NEXT: vpsrlq $32, %xmm0, %xmm1 -; AVX-NOPOPCNT-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NOPOPCNT-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NOPOPCNT-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NOPOPCNT-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NOPOPCNT-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 ; AVX-NOPOPCNT-NEXT: retq %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in) ret <2 x i64> %out } define <16 x i8> @testv16i8(<16 x i8> %in) { -; AVX-POPCNT-LABEL: testv16i8: -; AVX-POPCNT: # BB#0: -; AVX-POPCNT-NEXT: vpextrb $1, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpextrb $0, %xmm0, %ecx -; AVX-POPCNT-NEXT: popcntw %cx, %cx -; AVX-POPCNT-NEXT: vmovd %ecx, %xmm1 -; AVX-POPCNT-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $2, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $3, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $4, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $5, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $6, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: 
vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $7, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $8, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $9, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $10, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $11, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $12, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $13, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $14, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrb $15, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 -; AVX-POPCNT-NEXT: retq -; -; AVX-NOPOPCNT-LABEL: testv16i8: -; AVX-NOPOPCNT: # BB#0: -; AVX-NOPOPCNT-NEXT: vpextrb $1, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpextrb $0, %xmm0, %ecx -; AVX-NOPOPCNT-NEXT: movb %cl, %dl -; AVX-NOPOPCNT-NEXT: shrb %dl -; AVX-NOPOPCNT-NEXT: 
andb $85, %dl -; AVX-NOPOPCNT-NEXT: subb %dl, %cl -; AVX-NOPOPCNT-NEXT: movb %cl, %dl -; AVX-NOPOPCNT-NEXT: andb $51, %dl -; AVX-NOPOPCNT-NEXT: shrb $2, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: addb %dl, %cl -; AVX-NOPOPCNT-NEXT: movb %cl, %dl -; AVX-NOPOPCNT-NEXT: shrb $4, %dl -; AVX-NOPOPCNT-NEXT: addb %cl, %dl -; AVX-NOPOPCNT-NEXT: andb $15, %dl -; AVX-NOPOPCNT-NEXT: movzbl %dl, %ecx -; AVX-NOPOPCNT-NEXT: vmovd %ecx, %xmm1 -; AVX-NOPOPCNT-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $2, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $3, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $4, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; 
AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $5, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $6, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $7, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb 
$15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $8, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $9, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $10, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $11, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; 
AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $12, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $13, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $14, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: 
shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrb $15, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb %cl -; AVX-NOPOPCNT-NEXT: andb $85, %cl -; AVX-NOPOPCNT-NEXT: subb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $51, %cl -; AVX-NOPOPCNT-NEXT: shrb $2, %al -; AVX-NOPOPCNT-NEXT: andb $51, %al -; AVX-NOPOPCNT-NEXT: addb %cl, %al -; AVX-NOPOPCNT-NEXT: movb %al, %cl -; AVX-NOPOPCNT-NEXT: shrb $4, %cl -; AVX-NOPOPCNT-NEXT: addb %al, %cl -; AVX-NOPOPCNT-NEXT: andb $15, %cl -; AVX-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX-NOPOPCNT-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 -; AVX-NOPOPCNT-NEXT: retq +; AVX-LABEL: testv16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in) ret <16 x i8> %out } define <16 x i16> @testv16i16(<16 x i16> %in) { -; AVX-POPCNT-LABEL: testv16i16: -; AVX-POPCNT: # BB#0: -; AVX-POPCNT-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-POPCNT-NEXT: vpextrw $1, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vmovd %xmm1, %ecx -; AVX-POPCNT-NEXT: popcntw %cx, %cx -; AVX-POPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX-POPCNT-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrw $2, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $2, %eax, 
%xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrw $3, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrw $4, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrw $5, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrw $6, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrw $7, %xmm1, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 -; AVX-POPCNT-NEXT: vpextrw $1, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vmovd %xmm0, %ecx -; AVX-POPCNT-NEXT: popcntw %cx, %cx -; AVX-POPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX-POPCNT-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrw $2, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrw $3, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrw $4, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrw $5, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrw $6, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX-POPCNT-NEXT: vpextrw $7, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 -; AVX-POPCNT-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-POPCNT-NEXT: retq -; -; AVX-NOPOPCNT-LABEL: testv16i16: -; AVX-NOPOPCNT: # BB#0: -; AVX-NOPOPCNT-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrw $1, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; 
AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vmovd %xmm1, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX-NOPOPCNT-NEXT: shrl %edx -; AVX-NOPOPCNT-NEXT: andl $21845, %edx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %edx, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX-NOPOPCNT-NEXT: andl $13107, %edx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %edx, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX-NOPOPCNT-NEXT: andl $65520, %edx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %edx -; AVX-NOPOPCNT-NEXT: addl %ecx, %edx -; AVX-NOPOPCNT-NEXT: andl $3855, %edx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %edx, %ecx # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ch, %ecx # NOREX -; AVX-NOPOPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX-NOPOPCNT-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrw $2, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; 
AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrw $3, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrw $4, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrw $5, %xmm1, 
%eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrw $6, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrw $7, %xmm1, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl 
%ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vmovd %xmm0, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX-NOPOPCNT-NEXT: shrl %edx -; AVX-NOPOPCNT-NEXT: andl $21845, %edx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %edx, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX-NOPOPCNT-NEXT: andl $13107, %edx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %edx, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX-NOPOPCNT-NEXT: andl $65520, %edx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %edx -; AVX-NOPOPCNT-NEXT: addl %ecx, %edx -; AVX-NOPOPCNT-NEXT: andl $3855, %edx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %edx, %ecx # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ch, %ecx # NOREX -; AVX-NOPOPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX-NOPOPCNT-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; 
AVX-NOPOPCNT-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrw $4, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # 
imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrw $5, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrw $6, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; 
AVX-NOPOPCNT-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX-NOPOPCNT-NEXT: vpextrw $7, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 -; AVX-NOPOPCNT-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NOPOPCNT-NEXT: retq +; AVX-LABEL: testv16i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX-NEXT: vpand %xmm1, %xmm4, %xmm4 +; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm5, %xmm2, %xmm2 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm5 +; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm5 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; 
AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm1 +; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX-NEXT: retq %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in) ret <16 x i16> %out } define <8 x i16> @testv8i16(<8 x i16> %in) { -; AVX-POPCNT-LABEL: testv8i16: -; AVX-POPCNT: # BB#0: -; AVX-POPCNT-NEXT: vpextrw $1, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vmovd %xmm0, %ecx -; AVX-POPCNT-NEXT: popcntw %cx, %cx -; AVX-POPCNT-NEXT: vmovd %ecx, %xmm1 -; AVX-POPCNT-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrw $2, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrw $3, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrw $4, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrw $5, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrw $6, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; AVX-POPCNT-NEXT: vpextrw $7, %xmm0, %eax -; AVX-POPCNT-NEXT: popcntw %ax, %ax -; AVX-POPCNT-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 -; AVX-POPCNT-NEXT: retq -; -; AVX-NOPOPCNT-LABEL: testv8i16: -; AVX-NOPOPCNT: # BB#0: -; AVX-NOPOPCNT-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, 
%ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vmovd %xmm0, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX-NOPOPCNT-NEXT: shrl %edx -; AVX-NOPOPCNT-NEXT: andl $21845, %edx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %edx, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX-NOPOPCNT-NEXT: andl $13107, %edx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %edx, %ecx -; AVX-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX-NOPOPCNT-NEXT: andl $65520, %edx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %edx -; AVX-NOPOPCNT-NEXT: addl %ecx, %edx -; AVX-NOPOPCNT-NEXT: andl $3855, %edx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %edx, %ecx # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ch, %ecx # NOREX -; AVX-NOPOPCNT-NEXT: vmovd %ecx, %xmm1 -; AVX-NOPOPCNT-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull 
$257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrw $4, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrw $5, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx 
-; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrw $6, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 -; AVX-NOPOPCNT-NEXT: vpextrw $7, %xmm0, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: shrl %ecx -; AVX-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: shrl $2, %eax -; AVX-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX-NOPOPCNT-NEXT: andl 
$3855, %ecx # imm = 0xF0F -; AVX-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX-NOPOPCNT-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 -; AVX-NOPOPCNT-NEXT: retq +; AVX-LABEL: testv8i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX-NEXT: retq %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in) ret <8 x i16> %out } Index: test/CodeGen/X86/avx2-popcnt.ll =================================================================== --- test/CodeGen/X86/avx2-popcnt.ll +++ test/CodeGen/X86/avx2-popcnt.ll @@ -5,674 +5,35 @@ ; of popcnt instruction availability. 
define <32 x i8> @testv32i8(<32 x i8> %in) { -; AVX2-POPCNT-LABEL: testv32i8: -; AVX2-POPCNT: # BB#0: -; AVX2-POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-POPCNT-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpextrb $0, %xmm1, %ecx -; AVX2-POPCNT-NEXT: popcntw %cx, %cx -; AVX2-POPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX2-POPCNT-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $2, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $5, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $6, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $10, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; 
AVX2-POPCNT-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 -; AVX2-POPCNT-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpextrb $0, %xmm0, %ecx -; AVX2-POPCNT-NEXT: popcntw %cx, %cx -; AVX2-POPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX2-POPCNT-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $12, %xmm0, %eax -; 
AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 -; AVX2-POPCNT-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-POPCNT-NEXT: retq -; -; AVX2-NOPOPCNT-LABEL: testv32i8: -; AVX2-NOPOPCNT: # BB#0: -; AVX2-NOPOPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NOPOPCNT-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpextrb $0, %xmm1, %ecx -; AVX2-NOPOPCNT-NEXT: movb %cl, %dl -; AVX2-NOPOPCNT-NEXT: shrb %dl -; AVX2-NOPOPCNT-NEXT: andb $85, %dl -; AVX2-NOPOPCNT-NEXT: subb %dl, %cl -; AVX2-NOPOPCNT-NEXT: movb %cl, %dl -; AVX2-NOPOPCNT-NEXT: andb $51, %dl -; AVX2-NOPOPCNT-NEXT: shrb $2, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: addb %dl, %cl -; AVX2-NOPOPCNT-NEXT: movb %cl, %dl -; AVX2-NOPOPCNT-NEXT: shrb $4, %dl -; AVX2-NOPOPCNT-NEXT: addb %cl, %dl -; AVX2-NOPOPCNT-NEXT: andb $15, %dl -; AVX2-NOPOPCNT-NEXT: movzbl %dl, %ecx -; AVX2-NOPOPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $2, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl 
-; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $5, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; 
AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $6, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; 
AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $10, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; 
AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $15, %xmm1, %eax -; 
AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 -; AVX2-NOPOPCNT-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpextrb $0, %xmm0, %ecx -; AVX2-NOPOPCNT-NEXT: movb %cl, %dl -; AVX2-NOPOPCNT-NEXT: shrb %dl -; AVX2-NOPOPCNT-NEXT: andb $85, %dl -; AVX2-NOPOPCNT-NEXT: subb %dl, %cl -; AVX2-NOPOPCNT-NEXT: movb %cl, %dl -; AVX2-NOPOPCNT-NEXT: andb $51, %dl -; AVX2-NOPOPCNT-NEXT: shrb $2, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: addb %dl, %cl -; AVX2-NOPOPCNT-NEXT: movb %cl, %dl -; AVX2-NOPOPCNT-NEXT: shrb $4, %dl -; AVX2-NOPOPCNT-NEXT: addb %cl, %dl -; AVX2-NOPOPCNT-NEXT: andb $15, %dl -; AVX2-NOPOPCNT-NEXT: movzbl %dl, %ecx -; AVX2-NOPOPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: 
movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb 
%cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, 
%cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $12, %xmm0, %eax -; 
AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb %cl -; AVX2-NOPOPCNT-NEXT: andb $85, %cl -; AVX2-NOPOPCNT-NEXT: subb %cl, %al -; 
AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $51, %cl -; AVX2-NOPOPCNT-NEXT: shrb $2, %al -; AVX2-NOPOPCNT-NEXT: andb $51, %al -; AVX2-NOPOPCNT-NEXT: addb %cl, %al -; AVX2-NOPOPCNT-NEXT: movb %al, %cl -; AVX2-NOPOPCNT-NEXT: shrb $4, %cl -; AVX2-NOPOPCNT-NEXT: addb %al, %cl -; AVX2-NOPOPCNT-NEXT: andb $15, %cl -; AVX2-NOPOPCNT-NEXT: movzbl %cl, %eax -; AVX2-NOPOPCNT-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 -; AVX2-NOPOPCNT-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NOPOPCNT-NEXT: retq +; AVX2-LABEL: testv32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in) ret <32 x i8> %out } define <4 x i64> @testv4i64(<4 x i64> %in) { -; AVX2-POPCNT-LABEL: testv4i64: -; AVX2-POPCNT: # BB#0: -; AVX2-POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-POPCNT-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-POPCNT-NEXT: popcntq %rax, %rax -; AVX2-POPCNT-NEXT: vmovq %rax, %xmm2 -; AVX2-POPCNT-NEXT: vmovq %xmm1, %rax -; AVX2-POPCNT-NEXT: popcntq %rax, %rax -; AVX2-POPCNT-NEXT: vmovq %rax, %xmm1 -; AVX2-POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-POPCNT-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-POPCNT-NEXT: popcntq %rax, %rax -; AVX2-POPCNT-NEXT: vmovq %rax, %xmm2 -; AVX2-POPCNT-NEXT: vmovq %xmm0, %rax -; AVX2-POPCNT-NEXT: popcntq %rax, %rax -; AVX2-POPCNT-NEXT: vmovq %rax, %xmm0 -; AVX2-POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-POPCNT-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-POPCNT-NEXT: retq -; -; 
AVX2-NOPOPCNT-LABEL: testv4i64: -; AVX2-NOPOPCNT: # BB#0: -; AVX2-NOPOPCNT-NEXT: vpsrlq $1, %ymm0, %ymm1 -; AVX2-NOPOPCNT-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 -; AVX2-NOPOPCNT-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NOPOPCNT-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX2-NOPOPCNT-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 -; AVX2-NOPOPCNT-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NOPOPCNT-NEXT: vpsrlq $2, %ymm0, %ymm0 -; AVX2-NOPOPCNT-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NOPOPCNT-NEXT: vpaddq %ymm0, %ymm2, %ymm0 -; AVX2-NOPOPCNT-NEXT: vpsrlq $4, %ymm0, %ymm1 -; AVX2-NOPOPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NOPOPCNT-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 -; AVX2-NOPOPCNT-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NOPOPCNT-NEXT: vpsrlq $8, %ymm0, %ymm1 -; AVX2-NOPOPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NOPOPCNT-NEXT: vpsrlq $16, %ymm0, %ymm1 -; AVX2-NOPOPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NOPOPCNT-NEXT: vpsrlq $32, %ymm0, %ymm1 -; AVX2-NOPOPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NOPOPCNT-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 -; AVX2-NOPOPCNT-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NOPOPCNT-NEXT: retq +; AVX2-LABEL: testv4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in) ret <4 x i64> %out } @@ -680,378 +41,42 @@ define <8 x i32> @testv8i32(<8 x i32> %in) { ; AVX2-LABEL: testv8i32: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm1 -; 
AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsrld $2, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpsrld $4, %ymm0, %ymm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $8, %ymm0, %ymm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $16, %ymm0, %ymm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in) ret <8 x i32> %out } define <16 x i16> @testv16i16(<16 x i16> %in) { -; AVX2-POPCNT-LABEL: testv16i16: -; AVX2-POPCNT: # BB#0: -; AVX2-POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-POPCNT-NEXT: vpextrw $1, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vmovd %xmm1, %ecx -; AVX2-POPCNT-NEXT: popcntw %cx, %cx -; AVX2-POPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX2-POPCNT-NEXT: 
vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrw $2, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrw $3, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrw $4, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrw $5, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrw $6, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrw $7, %xmm1, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 -; AVX2-POPCNT-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vmovd %xmm0, %ecx -; AVX2-POPCNT-NEXT: popcntw %cx, %cx -; AVX2-POPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX2-POPCNT-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX2-POPCNT-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-POPCNT-NEXT: popcntw %ax, %ax -; AVX2-POPCNT-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 -; AVX2-POPCNT-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-POPCNT-NEXT: retq -; -; 
AVX2-NOPOPCNT-LABEL: testv16i16: -; AVX2-NOPOPCNT: # BB#0: -; AVX2-NOPOPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NOPOPCNT-NEXT: vpextrw $1, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: shrl %ecx -; AVX2-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %eax -; AVX2-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX2-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NOPOPCNT-NEXT: vmovd %xmm1, %ecx -; AVX2-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX2-NOPOPCNT-NEXT: shrl %edx -; AVX2-NOPOPCNT-NEXT: andl $21845, %edx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %edx, %ecx -; AVX2-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX2-NOPOPCNT-NEXT: andl $13107, %edx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %edx, %ecx -; AVX2-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX2-NOPOPCNT-NEXT: andl $65520, %edx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %edx -; AVX2-NOPOPCNT-NEXT: addl %ecx, %edx -; AVX2-NOPOPCNT-NEXT: andl $3855, %edx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %edx, %ecx # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ch, %ecx # NOREX -; AVX2-NOPOPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrw $2, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: shrl %ecx -; AVX2-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, 
%ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %eax -; AVX2-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX2-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NOPOPCNT-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrw $3, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: shrl %ecx -; AVX2-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %eax -; AVX2-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX2-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NOPOPCNT-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrw $4, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: shrl %ecx -; AVX2-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %eax -; AVX2-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %ecx -; 
AVX2-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NOPOPCNT-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrw $5, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: shrl %ecx -; AVX2-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %eax -; AVX2-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX2-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NOPOPCNT-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrw $6, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: shrl %ecx -; AVX2-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %eax -; AVX2-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX2-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NOPOPCNT-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrw $7, %xmm1, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; 
AVX2-NOPOPCNT-NEXT: shrl %ecx -; AVX2-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %eax -; AVX2-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX2-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NOPOPCNT-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 -; AVX2-NOPOPCNT-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: shrl %ecx -; AVX2-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %eax -; AVX2-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX2-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NOPOPCNT-NEXT: vmovd %xmm0, %ecx -; AVX2-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX2-NOPOPCNT-NEXT: shrl %edx -; AVX2-NOPOPCNT-NEXT: andl $21845, %edx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %edx, %ecx -; AVX2-NOPOPCNT-NEXT: movl %ecx, %edx -; AVX2-NOPOPCNT-NEXT: andl $13107, %edx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %edx, %ecx -; AVX2-NOPOPCNT-NEXT: movl %ecx, %edx -; 
AVX2-NOPOPCNT-NEXT: andl $65520, %edx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %edx -; AVX2-NOPOPCNT-NEXT: addl %ecx, %edx -; AVX2-NOPOPCNT-NEXT: andl $3855, %edx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %edx, %ecx # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ch, %ecx # NOREX -; AVX2-NOPOPCNT-NEXT: vmovd %ecx, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: shrl %ecx -; AVX2-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %eax -; AVX2-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX2-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NOPOPCNT-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: shrl %ecx -; AVX2-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %eax -; AVX2-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX2-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; 
AVX2-NOPOPCNT-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrw $4, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: shrl %ecx -; AVX2-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %eax -; AVX2-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX2-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NOPOPCNT-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrw $5, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: shrl %ecx -; AVX2-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %eax -; AVX2-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX2-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NOPOPCNT-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrw $6, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: shrl %ecx -; AVX2-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, 
%ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %eax -; AVX2-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX2-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NOPOPCNT-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 -; AVX2-NOPOPCNT-NEXT: vpextrw $7, %xmm0, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: shrl %ecx -; AVX2-NOPOPCNT-NEXT: andl $21845, %ecx # imm = 0x5555 -; AVX2-NOPOPCNT-NEXT: subl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $13107, %ecx # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: shrl $2, %eax -; AVX2-NOPOPCNT-NEXT: andl $13107, %eax # imm = 0x3333 -; AVX2-NOPOPCNT-NEXT: addl %ecx, %eax -; AVX2-NOPOPCNT-NEXT: movl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $65520, %ecx # imm = 0xFFF0 -; AVX2-NOPOPCNT-NEXT: shrl $4, %ecx -; AVX2-NOPOPCNT-NEXT: addl %eax, %ecx -; AVX2-NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F -; AVX2-NOPOPCNT-NEXT: imull $257, %ecx, %eax # imm = 0x101 -; AVX2-NOPOPCNT-NEXT: movzbl %ah, %eax # NOREX -; AVX2-NOPOPCNT-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 -; AVX2-NOPOPCNT-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NOPOPCNT-NEXT: retq +; AVX2-LABEL: testv16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: 
vpshufb {{.*#+}} ymm1 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: retq %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in) ret <16 x i16> %out }