Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -991,6 +991,14 @@
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 
+    // Only provide customized ctpop vector bit twiddling for vector types we
+    // know to perform better than using the popcnt instructions on each vector
+    // element. If popcnt isn't supported, always provide the custom version.
+    if (!Subtarget->hasPOPCNT()) {
+      setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
+      setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
+    }
+
     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
       MVT VT = (MVT::SimpleValueType)i;
@@ -1250,6 +1258,13 @@
     setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
     setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
 
+    // Only provide customized ctpop vector bit twiddling for vector types we
+    // know to perform better than using the popcnt instructions on each vector
+    // element: v8i32 is always profitable, v4i64 only when popcnt is disabled.
+    setOperationAction(ISD::CTPOP, MVT::v8i32, Custom);
+    if (!Subtarget->hasPOPCNT())
+      setOperationAction(ISD::CTPOP, MVT::v4i64, Custom);
+
     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
       setOperationAction(ISD::FMA, MVT::v8f32, Legal);
       setOperationAction(ISD::FMA, MVT::v4f64, Legal);
@@ -18852,6 +18867,153 @@
   return SDValue();
 }
 
+static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
+                          SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  SDLoc dl(Node);
+
+  Op = Op.getOperand(0);
+  EVT VT = Op.getValueType();
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "CTPOP lowering only implemented for 128/256-bit wide vector types");
+
+  unsigned NumElts = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+  unsigned Len = EltVT.getSizeInBits();
+
+  // This is the vectorized version of the "best" algorithm from
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+  // with a minor tweak to use a series of adds + shifts instead of vector
+  // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
+  //
+  //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
+  //  v8i32 => Always profitable
+  //
+  // FIXME: There are a couple of possible improvements:
+  //
+  // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
+  // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
+  //
+  assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
+         "CTPOP not implemented for this vector element type.");
+
+  if (VT.is256BitVector() && !Subtarget->hasInt256()) {
+    // Split v8i32 into two 128-bit CTPOPs and merge the results back. This is
+    // slightly better than using popcnt on each element on Haswell. Also split
+    // v4i64 into two 128-bit CTPOPs when popcnt is disabled, to save a few
+    // extracts and inserts.
+    SDValue V0 = Extract128BitVector(Op, 0, DAG, dl);
+    SDValue V1 = Extract128BitVector(Op, NumElts/2, DAG, dl);
+    V0 = LowerCTPOP(DAG.getNode(ISD::CTPOP, dl, V0.getValueType(), V0),
+                    Subtarget, DAG);
+    V1 = LowerCTPOP(DAG.getNode(ISD::CTPOP, dl, V1.getValueType(), V1),
+                    Subtarget, DAG);
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V0, V1);
+  }
+
+  // X86 canonicalizes ANDs to vXi64; generate the appropriate bitcasts to
+  // avoid extra legalization.
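+  // For example, with a v4i32 input: each 0x55/0x33/0x0F splat mask below is
+  // built as a v4i32 BUILD_VECTOR, the mask and the value being masked are
+  // bitcast to v2i64, the AND is emitted on v2i64, and its result is bitcast
+  // back to v4i32 before the next arithmetic node.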
+  bool NeedsBitcast = EltVT == MVT::i32;
+  MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
+
+  SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
+  SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
+  SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
+
+  // v = v - ((v >> 1) & 0x55555555...)
+  SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
+  SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
+  SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
+  if (NeedsBitcast)
+    Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
+
+  SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
+  SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
+  if (NeedsBitcast)
+    M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
+
+  SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
+  if (VT != And.getValueType())
+    And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+  SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
+
+  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+  SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
+  SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
+  SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
+  SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
+
+  Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
+  if (NeedsBitcast) {
+    Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
+    M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
+    Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
+  }
+
+  SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
+  SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
+  if (VT != AndRHS.getValueType()) {
+    AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
+    AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
+  }
+  SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
+
+  // v = (v + (v >> 4)) & 0x0F0F0F0F...
+  SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
+  SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
+  Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
+  Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
+
+  SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
+  SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
+  if (NeedsBitcast) {
+    Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
+    M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
+  }
+  And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
+  if (VT != And.getValueType())
+    And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+
+  // The algorithm mentioned above uses:
+  //    v = (v * 0x01010101...) >> (Len - 8)
+  //
+  // Change it to use vector adds + vector shifts, which yield faster results
+  // on Haswell than using vector integer multiplication.
+  //
+  // For i32 elements:
+  //    v = v + (v >> 8)
+  //    v = v + (v >> 16)
+  //
+  // For i64 elements:
+  //    v = v + (v >> 8)
+  //    v = v + (v >> 16)
+  //    v = v + (v >> 32)
+  //
+  Add = And;
+  SmallVector<SDValue, 8> Csts;
+  for (unsigned i = 8; i <= Len/2; i *= 2) {
+    Csts.assign(NumElts, DAG.getConstant(i, EltVT));
+    SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
+    Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
+    Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
+    Csts.clear();
+  }
+
+  // The result is in the least significant 6 bits for i32 and 7 bits for i64.
+  SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
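+  // After the adds + shifts above, the low byte of each element holds the
+  // complete count and the upper bytes hold leftover partial sums, so a
+  // single mask with 0x3F (i32) or 0x7F (i64) produces the final result.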
+  SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
+  SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
+  if (NeedsBitcast) {
+    Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
+    M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
+  }
+  And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
+  if (VT != And.getValueType())
+    And = DAG.getNode(ISD::BITCAST, dl, VT, And);
+
+  return And;
+}
+
 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
   SDNode *Node = Op.getNode();
   SDLoc dl(Node);
@@ -18979,6 +19141,7 @@
   case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
     return LowerCMP_SWAP(Op, Subtarget, DAG);
+  case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
   case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
   case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
   case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
Index: test/CodeGen/X86/vector-ctpop.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/vector-ctpop.ll
@@ -0,0 +1,238 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck -check-prefix=AVX1 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck -check-prefix=AVX2 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=-popcnt | FileCheck -check-prefix=AVX1-NOPOPCNT %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=-popcnt | FileCheck -check-prefix=AVX2-NOPOPCNT %s
+
+; Vector version of:
+; v = v - ((v >> 1) & 0x55555555)
+; v = (v & 0x33333333) + ((v >> 2) & 0x33333333)
+; v = (v + (v >> 4)) & 0x0F0F0F0F
+; v = v + (v >> 8)
+; v = v + (v >> 16)
+; v = v + (v >> 32) ; i64 only
+
+define <8 x i32> @test0(<8 x i32> %x) {
+; AVX2-LABEL: @test0
+; AVX1-LABEL: @test0
+entry:
+; AVX2: vpsrld $1, %ymm
+; AVX2-NEXT: vpbroadcastd
+; AVX2-NEXT: vpand
+; AVX2-NEXT: vpsubd
+; AVX2-NEXT: vpbroadcastd
+; AVX2-NEXT: vpand
+; AVX2-NEXT: vpsrld $2
+; AVX2-NEXT: vpand
+; AVX2-NEXT: vpaddd
+; AVX2-NEXT: vpsrld $4
+; AVX2-NEXT: vpaddd
+; AVX2-NEXT: vpbroadcastd
+; AVX2-NEXT: vpand
+; AVX2-NEXT: vpsrld $8
+; AVX2-NEXT: vpaddd
+; AVX2-NEXT: vpsrld $16
+; AVX2-NEXT: vpaddd
+; AVX2-NEXT: vpbroadcastd
+; AVX2-NEXT: vpand
+; AVX1: vextractf128 $1, %ymm
+; AVX1-NEXT: vpsrld $1, %xmm
+; AVX1-NEXT: vmovdqa
+; AVX1-NEXT: vpand
+; AVX1-NEXT: vpsubd
+; AVX1-NEXT: vmovdqa
+; AVX1-NEXT: vpand
+; AVX1-NEXT: vpsrld $2
+; AVX1-NEXT: vpand
+; AVX1-NEXT: vpaddd
+; AVX1-NEXT: vpsrld $4
+; AVX1-NEXT: vpaddd
+; AVX1-NEXT: vmovdqa
+; AVX1-NEXT: vpand
+; AVX1-NEXT: vpsrld $8
+; AVX1-NEXT: vpaddd
+; AVX1-NEXT: vpsrld $16
+; AVX1-NEXT: vpaddd
+; AVX1-NEXT: vmovdqa
+; AVX1-NEXT: vpand
+; AVX1-NEXT: vpsrld $1, %xmm
+; AVX1-NEXT: vpand
+; AVX1-NEXT: vpsubd
+; AVX1-NEXT: vpand
+; AVX1-NEXT: vpsrld $2
+; AVX1-NEXT: vpand
+; AVX1-NEXT: vpaddd
+; AVX1-NEXT: vpsrld $4
+; AVX1-NEXT: vpaddd
+; AVX1-NEXT: vpand
+; AVX1-NEXT: vpsrld $8
+; AVX1-NEXT: vpaddd
+; AVX1-NEXT: vpsrld $16
+; AVX1-NEXT: vpaddd
+; AVX1-NEXT: vpand
+; AVX1-NEXT: vinsertf128 $1
+  %y = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %x)
+  ret <8 x i32> %y
+}
+
+define <4 x i64> @test1(<4 x i64> %x) {
+; AVX2-NOPOPCNT-LABEL: @test1
+; AVX1-NOPOPCNT-LABEL: @test1
+entry:
+; AVX2-NOPOPCNT: vpsrlq $1, %ymm
+; AVX2-NOPOPCNT-NEXT: vpbroadcastq
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsubq
+; AVX2-NOPOPCNT-NEXT: vpbroadcastq
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsrlq $2
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpsrlq $4
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpbroadcastq
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsrlq $8
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpsrlq $16
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpsrlq $32
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpbroadcastq
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT: vextractf128 $1, %ymm
+; AVX1-NOPOPCNT-NEXT: vpsrlq $1, %xmm
+; AVX1-NOPOPCNT-NEXT: vmovdqa
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsubq
+; AVX1-NOPOPCNT-NEXT: vmovdqa
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsrlq $2
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpsrlq $4
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vmovdqa
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsrlq $8
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpsrlq $16
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpsrlq $32
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vmovdqa
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsrlq $1
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsubq
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsrlq $2
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpsrlq $4
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsrlq $8
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpsrlq $16
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpsrlq $32
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vinsertf128 $1, %xmm
+  %y = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %x)
+  ret <4 x i64> %y
+}
+
+define <4 x i32> @test2(<4 x i32> %x) {
+; AVX2-NOPOPCNT-LABEL: @test2
+; AVX1-NOPOPCNT-LABEL: @test2
+entry:
+; AVX2-NOPOPCNT: vpsrld $1, %xmm
+; AVX2-NOPOPCNT-NEXT: vpbroadcastd
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsubd
+; AVX2-NOPOPCNT-NEXT: vpbroadcastd
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsrld $2
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpaddd
+; AVX2-NOPOPCNT-NEXT: vpsrld $4
+; AVX2-NOPOPCNT-NEXT: vpaddd
+; AVX2-NOPOPCNT-NEXT: vpbroadcastd
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsrld $8
+; AVX2-NOPOPCNT-NEXT: vpaddd
+; AVX2-NOPOPCNT-NEXT: vpsrld $16
+; AVX2-NOPOPCNT-NEXT: vpaddd
+; AVX2-NOPOPCNT-NEXT: vpbroadcastd
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT: vpsrld $1, %xmm
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsubd
+; AVX1-NOPOPCNT-NEXT: vmovdqa
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsrld $2
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpaddd
+; AVX1-NOPOPCNT-NEXT: vpsrld $4
+; AVX1-NOPOPCNT-NEXT: vpaddd
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsrld $8
+; AVX1-NOPOPCNT-NEXT: vpaddd
+; AVX1-NOPOPCNT-NEXT: vpsrld $16
+; AVX1-NOPOPCNT-NEXT: vpaddd
+; AVX1-NOPOPCNT-NEXT: vpand
+  %y = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x)
+  ret <4 x i32> %y
+}
+
+define <2 x i64> @test3(<2 x i64> %x) {
+; AVX2-NOPOPCNT-LABEL: @test3
+; AVX1-NOPOPCNT-LABEL: @test3
+entry:
+; AVX2-NOPOPCNT: vpsrlq $1, %xmm
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsubq
+; AVX2-NOPOPCNT-NEXT: vmovdqa
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsrlq $2
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpsrlq $4
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsrlq $8
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpsrlq $16
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpsrlq $32
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT: vpsrlq $1, %xmm
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsubq
+; AVX1-NOPOPCNT-NEXT: vmovdqa
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsrlq $2
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpsrlq $4
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsrlq $8
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpsrlq $16
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpsrlq $32
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpand
+  %y = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x)
+  ret <2 x i64> %y
+}
+
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
+
+declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
+declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>)
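
For reference, here is a scalar C++ sketch of the algorithm the lowering above vectorizes per element (illustrative only; it is not part of the patch and the helper names are invented). It mirrors the emitted DAG: the three bit-twiddling folds, the adds + shifts that replace the multiply by 0x01010101..., and the final 0x3F/0x7F mask.

// Scalar model of the vectorized CTPOP lowering (illustrative sketch only).
#include <cstdint>

static inline uint32_t popcount32_model(uint32_t v) {
  v = v - ((v >> 1) & 0x55555555u);                 // 2-bit partial counts
  v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u); // 4-bit partial counts
  v = (v + (v >> 4)) & 0x0F0F0F0Fu;                 // per-byte counts
  v = v + (v >> 8);                                 // adds + shifts instead of
  v = v + (v >> 16);                                // multiplying by 0x01010101
  return v & 0x3Fu;                                 // count fits in 6 bits
}

static inline uint64_t popcount64_model(uint64_t v) {
  v = v - ((v >> 1) & 0x5555555555555555ull);
  v = (v & 0x3333333333333333ull) + ((v >> 2) & 0x3333333333333333ull);
  v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0Full;
  v = v + (v >> 8);
  v = v + (v >> 16);
  v = v + (v >> 32);                                // extra step for 64-bit
  return v & 0x7Full;                               // count fits in 7 bits
}

The vector lowering performs exactly these steps lane-wise, with each constant splatted across the vector, which is what the vpsrl/vpand/vpadd/vpsub sequences in the tests above check for.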