Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -275,6 +275,10 @@
     return false;
   }
 
+  /// Return true if it is cheaper for the target to expand vector population
+  /// count by unrolling the operation instead of using vector bit manipulation.
+  virtual bool isCheapToUnrollVectorPopCount(EVT VT) const { return true; }
+
   /// \brief Return if the target supports combining a
   /// chain like:
   /// \code
Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -75,6 +75,10 @@
   /// \brief Implement expansion for SIGN_EXTEND_INREG using SRL and SRA.
   SDValue ExpandSEXTINREG(SDValue Op);
 
+  /// \brief Implement expansion for CTPOP with parallel bitmath using SRL, AND,
+  /// SUB and ADD.
+  SDValue ExpandCTPOP(SDValue Op);
+
   /// \brief Implement expansion for ANY_EXTEND_VECTOR_INREG.
   ///
   /// Shuffles the low lanes of the operand into place and bitcasts to the proper
@@ -698,6 +702,8 @@
     return ExpandUINT_TO_FLOAT(Op);
   case ISD::FNEG:
     return ExpandFNEG(Op);
+  case ISD::CTPOP:
+    return ExpandCTPOP(Op);
   case ISD::SETCC:
     return UnrollVSETCC(Op);
   default:
@@ -763,6 +769,107 @@
   return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Val);
 }
 
+SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) {
+  EVT VT = Op.getValueType();
+  EVT EltVT = VT.getVectorElementType();
+  unsigned EltSizeInBits = EltVT.getSizeInBits();
+
+  // The current algorithm does not support element sizes greater than 128 bits
+  // or element sizes that are not a multiple of 8 bits.
+  if (EltSizeInBits > 128 || EltSizeInBits % 8 != 0)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  // Assume that the bitmath expansion implemented below is better than unrolling
+  // CTPOP under certain conditions, but allow targets to decide otherwise.
+  if (TLI.isCheapToUnrollVectorPopCount(VT) ||
+      TLI.getOperationAction(ISD::SRL, VT) == TargetLowering::Expand ||
+      TLI.getOperationAction(ISD::AND, VT) == TargetLowering::Expand ||
+      TLI.getOperationAction(ISD::SUB, VT) == TargetLowering::Expand ||
+      TLI.getOperationAction(ISD::ADD, VT) == TargetLowering::Expand)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  SDNode *Node = Op.getNode();
+  SDLoc dl(Node);
+  Op = Op.getOperand(0);
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // This is the vectorized version of the "best" algorithm from
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+  // with a minor tweak to use a series of adds + shifts instead of vector
+  // multiplications.
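+  //
+  // For example, a single i8 element with value v = 0xFF reduces as follows
+  // (an illustrative walk-through only; the masks generalize to wider elements):
+  //   v = 0xFF - ((0xFF >> 1) & 0x55)          = 0xAA  (each 2-bit pair holds 2)
+  //   v = (0xAA & 0x33) + ((0xAA >> 2) & 0x33) = 0x44  (each nibble holds 4)
+  //   v = (0x44 + (0x44 >> 4)) & 0x0F          = 0x08  (popcount(0xFF) == 8)
+  // The code below builds the same sequence out of vector nodes, using splat
+  // BUILD_VECTORs for the masks and shift amounts.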
+  SDValue Cst55 = DAG.getConstant(
+      APInt::getSplat(EltSizeInBits, APInt(8, 0x55)), dl, EltVT);
+  SDValue Cst33 = DAG.getConstant(
+      APInt::getSplat(EltSizeInBits, APInt(8, 0x33)), dl, EltVT);
+  SDValue Cst0F = DAG.getConstant(
+      APInt::getSplat(EltSizeInBits, APInt(8, 0x0F)), dl, EltVT);
+
+  SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, dl, EltVT));
+  SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
+
+  SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
+  SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
+
+  SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
+  SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
+
+  SmallVector<SDValue, 8> Two(NumElts, DAG.getConstant(2, dl, EltVT));
+  SDValue TwoV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Two);
+
+  SmallVector<SDValue, 8> Four(NumElts, DAG.getConstant(4, dl, EltVT));
+  SDValue FourV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Four);
+
+  SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
+  SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
+
+  // v = v - ((v >> 1) & 0x55555555...)
+  SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
+  SDValue And = DAG.getNode(ISD::AND, dl, VT, Srl, M55);
+  SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
+
+  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+  Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwoV);
+  SDValue AndRHS = DAG.getNode(ISD::AND, dl, VT, Srl, M33);
+  SDValue AndLHS = DAG.getNode(ISD::AND, dl, VT, Sub, M33);
+  SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
+
+  // v = (v + (v >> 4)) & 0x0F0F0F0F...
+  Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FourV);
+  Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
+  And = DAG.getNode(ISD::AND, dl, VT, Add, M0F);
+
+  // One way to compute the final step is to use MUL:
+  //   v = (v * 0x01010101...) >> (EltSizeInBits - 8)
+  //
+  // We use shifts + adds instead:
+  //
+  // For i16 elements:
+  //   v = v + (v >> 8)
+  //
+  // For i32 elements:
+  //   v = v + (v >> 8)
+  //   v = v + (v >> 16)
+  //
+  // For i64 elements:
+  //   v = v + (v >> 8)
+  //   v = v + (v >> 16)
+  //   v = v + (v >> 32)
+  //
+  Add = And;
+  for (unsigned i = 8; i <= EltSizeInBits/2; i *= 2) {
+    SmallVector<SDValue, 8> Csts(NumElts, DAG.getConstant(i, dl, EltVT));
+    SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
+    Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
+    Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
+  }
+
+  // Mask out everything but the relevant popcount bits in each element.
+  SDValue MaskRes = DAG.getConstant((EltSizeInBits << 1) - 1, dl, EltVT);
+  SmallVector<SDValue, 8> ByteMaskResV(NumElts, MaskRes);
+  return DAG.getNode(ISD::AND, dl, VT, Add,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ByteMaskResV));
+}
+
 SDValue VectorLegalizer::ExpandSEXTINREG(SDValue Op) {
   EVT VT = Op.getValueType();
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -658,6 +658,11 @@
   bool isCheapToSpeculateCtlz() const override;
 
+  /// Return true if it is cheaper for the target to expand vector population
+  /// count by unrolling the operation instead of using vector bit
+  /// manipulation.
+  bool isCheapToUnrollVectorPopCount(EVT VT) const override;
+
   /// Return the value type to use for ISD::SETCC.
   EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
 
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1145,6 +1145,10 @@
     // Always custom lower if avx2 is available.
     setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
 
+    // It is faster to use the general vector expansion for v8i32+avx2
+    // than to custom lower it or unroll it into several scalar ctpops.
+    setOperationAction(ISD::CTPOP, MVT::v8i32, Expand);
+
     // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
     // when we have a 256bit-wide blend with immediate.
     setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
@@ -1655,6 +1659,21 @@
   return VT.changeVectorElementTypeToInteger();
 }
 
+bool X86TargetLowering::isCheapToUnrollVectorPopCount(EVT VT) const {
+  assert(VT.isVector() && "Should only handle vector pop count");
+  EVT EltVT = VT.getVectorElementType();
+
+  // Never unroll if a scalar ctpop is not available.
+  if (getOperationAction(ISD::CTPOP, EltVT) == TargetLowering::Expand)
+    return false;
+
+  // Unrolling is better than custom lowering and vector bitmath for these types.
+  if (VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4i64)
+    return true;
+
+  return false;
+}
+
 /// Helper for getByValTypeAlignment to determine
 /// the desired ByVal argument alignment.
 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
Index: test/CodeGen/X86/avx2-popcnt.ll
===================================================================
--- test/CodeGen/X86/avx2-popcnt.ll
+++ test/CodeGen/X86/avx2-popcnt.ll
@@ -43,23 +43,25 @@
 define <8 x i32> @testv8i32(<8 x i32> %in) {
 ; CHECK-LABEL: testv8i32:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; CHECK-NEXT: vpsrld $1, %ymm0, %ymm1
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
 ; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; CHECK-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $2, %ymm0, %ymm0
 ; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vpsrld $4, %ymm0, %ymm1
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $8, %ymm0, %ymm1
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrld $16, %ymm0, %ymm1
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
 ; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
-; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm1
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; CHECK-NEXT: vpsadbw %ymm1, %ymm2, %ymm1
-; CHECK-NEXT: vpsrlq $32, %ymm0, %ymm0
-; CHECK-NEXT: vpsadbw %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vpsllq $32, %ymm0, %ymm0
-; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: retq
   %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in)
   ret <8 x i32> %out
Index: test/CodeGen/X86/sse2-popcnt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/sse2-popcnt.ll
@@ -0,0 +1,121 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2,-ssse3,-sse4.2,-popcnt | FileCheck %s
+
+define <16 x i8> @testv16i8(<16 x i8> %in) {
+; CHECK-LABEL: testv16i8:
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlw $1, %xmm1
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT: psubb %xmm1, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pand %xmm1, %xmm2
+; CHECK-NEXT: psrlw $2, %xmm0
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pand %xmm1, %xmm0
+; CHECK-NEXT: paddb %xmm2, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlw $4, %xmm1
+; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; CHECK-NEXT: pand %xmm2, %xmm1
+; CHECK-NEXT: paddb %xmm0, %xmm1
+; CHECK-NEXT: pand %xmm2, %xmm1
+; CHECK-NEXT: pand %xmm2, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
+  ret <16 x i8> %out
+}
+
+define <4 x i32> @testv4i32(<4 x i32> %in) {
+; CHECK-LABEL: testv4i32:
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $1, %xmm1
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT: psubd %xmm1, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pand %xmm1, %xmm2
+; CHECK-NEXT: psrld $2, %xmm0
+; CHECK-NEXT: pand %xmm1, %xmm0
+; CHECK-NEXT: paddd %xmm2, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrld $4, %xmm1
+; CHECK-NEXT: paddd %xmm0, %xmm1
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: psrld $8, %xmm2
+; CHECK-NEXT: paddd %xmm1, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: psrld $16, %xmm0
+; CHECK-NEXT: paddd %xmm2, %xmm0
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
+  ret <4 x i32> %out
+}
+
+define <2 x i64> @testv2i64(<2 x i64> %in) {
+; CHECK-LABEL: testv2i64:
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlq $1, %xmm1
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT: psubq %xmm1, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pand %xmm1, %xmm2
+; CHECK-NEXT: psrlq $2, %xmm0
+; CHECK-NEXT: pand %xmm1, %xmm0
+; CHECK-NEXT: paddq %xmm2, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlq $4, %xmm1
+; CHECK-NEXT: paddq %xmm0, %xmm1
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: psrlq $8, %xmm0
+; CHECK-NEXT: paddq %xmm1, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlq $16, %xmm1
+; CHECK-NEXT: paddq %xmm0, %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: psrlq $32, %xmm0
+; CHECK-NEXT: paddq %xmm1, %xmm0
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
+  ret <2 x i64> %out
+}
+
+define <8 x i16> @testv8i16(<8 x i16> %in) {
+; CHECK-LABEL: testv8i16:
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlw $1, %xmm1
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT: psubw %xmm1, %xmm0
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pand %xmm1, %xmm2
+; CHECK-NEXT: psrlw $2, %xmm0
+; CHECK-NEXT: pand %xmm1, %xmm0
+; CHECK-NEXT: paddw %xmm2, %xmm0
+; CHECK-NEXT: movdqa %xmm0, %xmm1
+; CHECK-NEXT: psrlw $4, %xmm1
+; CHECK-NEXT: paddw %xmm0, %xmm1
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: psrlw $8, %xmm0
+; CHECK-NEXT: paddw %xmm1, %xmm0
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: retq
+  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
+  ret <8 x i16> %out
+}
+
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
+declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
Index: test/CodeGen/X86/sse42-popcnt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/sse42-popcnt.ll
@@ -0,0 +1,160 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+popcnt | FileCheck -check-prefix=SSE42 %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,-popcnt | FileCheck -check-prefix=SSE42-NOPOPCNT %s
+
+define <16 x i8> @testv16i8(<16 x i8> %in) {
+; SSE42-LABEL: testv16i8:
+; SSE42: # BB#0:
+; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: pand %xmm2, %xmm3
+; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE42-NEXT: movdqa %xmm1, %xmm4
+; SSE42-NEXT: pshufb %xmm3, %xmm4
+; SSE42-NEXT: psrlw $4, %xmm0
+; SSE42-NEXT: pand %xmm2, %xmm0
+; SSE42-NEXT: pand %xmm2, %xmm0
+; SSE42-NEXT: pshufb %xmm0, %xmm1
+; SSE42-NEXT: paddb %xmm4, %xmm1
+; SSE42-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NEXT: retq
+; SSE42-NOPOPCNT-LABEL: testv16i8:
+; SSE42-NOPOPCNT: # BB#0:
+; SSE42-NOPOPCNT-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE42-NOPOPCNT-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NOPOPCNT-NEXT: pand %xmm2, %xmm3
+; SSE42-NOPOPCNT-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE42-NOPOPCNT-NEXT: movdqa %xmm1, %xmm4
+; SSE42-NOPOPCNT-NEXT: pshufb %xmm3, %xmm4
+; SSE42-NOPOPCNT-NEXT: psrlw $4, %xmm0
+; SSE42-NOPOPCNT-NEXT: pand %xmm2, %xmm0
+; SSE42-NOPOPCNT-NEXT: pand %xmm2, %xmm0
+; SSE42-NOPOPCNT-NEXT: pshufb %xmm0, %xmm1
+; SSE42-NOPOPCNT-NEXT: paddb %xmm4, %xmm1
+; SSE42-NOPOPCNT-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NOPOPCNT-NEXT: retq
+  %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
+  ret <16 x i8> %out
+}
+
+define <4 x i32> @testv4i32(<4 x i32> %in) {
+; SSE42-LABEL: testv4i32:
+; SSE42: # BB#0:
+; SSE42-NEXT: pextrd $1, %xmm0, %eax
+; SSE42-NEXT: popcntl %eax, %eax
+; SSE42-NEXT: movd %xmm0, %ecx
+; SSE42-NEXT: popcntl %ecx, %ecx
+; SSE42-NEXT: movd %ecx, %xmm1
+; SSE42-NEXT: pinsrd $1, %eax, %xmm1
+; SSE42-NEXT: pextrd $2, %xmm0, %eax
+; SSE42-NEXT: popcntl %eax, %eax
+; SSE42-NEXT: pinsrd $2, %eax, %xmm1
+; SSE42-NEXT: pextrd $3, %xmm0, %eax
+; SSE42-NEXT: popcntl %eax, %eax
+; SSE42-NEXT: pinsrd $3, %eax, %xmm1
+; SSE42-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NEXT: retq
+; SSE42-NOPOPCNT-LABEL: testv4i32:
+; SSE42-NOPOPCNT: # BB#0:
+; SSE42-NOPOPCNT-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE42-NOPOPCNT-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NOPOPCNT-NEXT: pand %xmm2, %xmm3
+; SSE42-NOPOPCNT-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE42-NOPOPCNT-NEXT: movdqa %xmm1, %xmm4
+; SSE42-NOPOPCNT-NEXT: pshufb %xmm3, %xmm4
+; SSE42-NOPOPCNT-NEXT: psrlw $4, %xmm0
+; SSE42-NOPOPCNT-NEXT: pand %xmm2, %xmm0
+; SSE42-NOPOPCNT-NEXT: pand %xmm2, %xmm0
+; SSE42-NOPOPCNT-NEXT: pshufb %xmm0, %xmm1
+; SSE42-NOPOPCNT-NEXT: paddb %xmm4, %xmm1
+; SSE42-NOPOPCNT-NEXT: movdqa {{.*#+}} xmm0 = [4294967295,4294967295]
+; SSE42-NOPOPCNT-NEXT: pand %xmm1, %xmm0
+; SSE42-NOPOPCNT-NEXT: pxor %xmm2, %xmm2
+; SSE42-NOPOPCNT-NEXT: psadbw %xmm2, %xmm0
+; SSE42-NOPOPCNT-NEXT: psrlq $32, %xmm1
+; SSE42-NOPOPCNT-NEXT: psadbw %xmm2, %xmm1
+; SSE42-NOPOPCNT-NEXT: psllq $32, %xmm1
+; SSE42-NOPOPCNT-NEXT: por %xmm0, %xmm1
+; SSE42-NOPOPCNT-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NOPOPCNT-NEXT: retq
+  %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
+  ret <4 x i32> %out
+}
+
+define <2 x i64> @testv2i64(<2 x i64> %in) {
+; SSE42-LABEL: testv2i64:
+; SSE42: # BB#0:
+; SSE42-NEXT: pextrq $1, %xmm0, %rax
+; SSE42-NEXT: popcntq %rax, %rax
+; SSE42-NEXT: movd %rax, %xmm1
+; SSE42-NEXT: movd %xmm0, %rax
+; SSE42-NEXT: popcntq %rax, %rax
+; SSE42-NEXT: movd %rax, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE42-NEXT: retq
+; SSE42-NOPOPCNT-LABEL: testv2i64:
+; SSE42-NOPOPCNT: # BB#0:
+; SSE42-NOPOPCNT-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE42-NOPOPCNT-NEXT: movdqa %xmm0, %xmm2
+; SSE42-NOPOPCNT-NEXT: pand %xmm1, %xmm2
+; SSE42-NOPOPCNT-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE42-NOPOPCNT-NEXT: movdqa %xmm3, %xmm4
+; SSE42-NOPOPCNT-NEXT: pshufb %xmm2, %xmm4
+; SSE42-NOPOPCNT-NEXT: psrlw $4, %xmm0
+; SSE42-NOPOPCNT-NEXT: pand %xmm1, %xmm0
+; SSE42-NOPOPCNT-NEXT: pand %xmm1, %xmm0
+; SSE42-NOPOPCNT-NEXT: pshufb %xmm0, %xmm3
+; SSE42-NOPOPCNT-NEXT: paddb %xmm4, %xmm3
+; SSE42-NOPOPCNT-NEXT: pxor %xmm0, %xmm0
+; SSE42-NOPOPCNT-NEXT: psadbw %xmm3, %xmm0
+; SSE42-NOPOPCNT-NEXT: retq
+  %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
+  ret <2 x i64> %out
+}
+
+define <8 x i16> @testv8i16(<8 x i16> %in) {
+; SSE42-LABEL: testv8i16:
+; SSE42: # BB#0:
+; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE42-NEXT: movdqa %xmm0, %xmm2
+; SSE42-NEXT: pand %xmm1, %xmm2
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE42-NEXT: movdqa %xmm3, %xmm4
+; SSE42-NEXT: pshufb %xmm2, %xmm4
+; SSE42-NEXT: psrlw $4, %xmm0
+; SSE42-NEXT: pand %xmm1, %xmm0
+; SSE42-NEXT: pand %xmm1, %xmm0
+; SSE42-NEXT: pshufb %xmm0, %xmm3
+; SSE42-NEXT: paddb %xmm4, %xmm3
+; SSE42-NEXT: movdqa %xmm3, %xmm0
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: paddb %xmm0, %xmm3
+; SSE42-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE42-NEXT: retq
+; SSE42-NOPOPCNT-LABEL: testv8i16:
+; SSE42-NOPOPCNT: # BB#0:
+; SSE42-NOPOPCNT-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE42-NOPOPCNT-NEXT: movdqa %xmm0, %xmm2
+; SSE42-NOPOPCNT-NEXT: pand %xmm1, %xmm2
+; SSE42-NOPOPCNT-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE42-NOPOPCNT-NEXT: movdqa %xmm3, %xmm4
+; SSE42-NOPOPCNT-NEXT: pshufb %xmm2, %xmm4
+; SSE42-NOPOPCNT-NEXT: psrlw $4, %xmm0
+; SSE42-NOPOPCNT-NEXT: pand %xmm1, %xmm0
+; SSE42-NOPOPCNT-NEXT: pand %xmm1, %xmm0
+; SSE42-NOPOPCNT-NEXT: pshufb %xmm0, %xmm3
+; SSE42-NOPOPCNT-NEXT: paddb %xmm4, %xmm3
+; SSE42-NOPOPCNT-NEXT: movdqa %xmm3, %xmm0
+; SSE42-NOPOPCNT-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSE42-NOPOPCNT-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE42-NOPOPCNT-NEXT: paddb %xmm0, %xmm3
+; SSE42-NOPOPCNT-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; SSE42-NOPOPCNT-NEXT: retq
+  %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
+  ret <8 x i16> %out
+}
+
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
+declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)