Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -414,6 +414,8 @@ setOperationAction(ISD::CTPOP , MVT::i32 , Expand); if (Subtarget.is64Bit()) setOperationAction(ISD::CTPOP , MVT::i64 , Expand); + else + setOperationAction(ISD::CTPOP , MVT::i64 , Custom); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); @@ -26702,6 +26704,22 @@ switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); + case ISD::CTPOP: + assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); + // Use a v2i64 if possible. + if (isTypeLegal(MVT::v2i64)) { + SDValue Wide = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + N->getOperand(0)); + Wide = DAG.getNode(N->getOpcode(), dl, MVT::v2i64, Wide); + // Bit count should fit in 32-bits, extract it as that and then zero + // extend to i64. + Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide); + Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide, + DAG.getIntPtrConstant(0, dl)); + Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide); + Results.push_back(Wide); + } + return; case ISD::MUL: { EVT VT = N->getValueType(0); assert(VT.isVector() && "Unexpected VT"); Index: llvm/test/CodeGen/X86/popcnt.ll =================================================================== --- llvm/test/CodeGen/X86/popcnt.ll +++ llvm/test/CodeGen/X86/popcnt.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=X32,X32-NOSSE ; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64 ; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X32-POPCNT ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt | FileCheck %s --check-prefix=X64-POPCNT +; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 | FileCheck %s --check-prefixes=X32,X32-SSE2 +; RUN: llc < %s -mtriple=i686-unknown -mattr=ssse3 | FileCheck %s --check-prefixes=X32,X32-SSSE3 define i8 @cnt8(i8 %x) nounwind readnone { ; X32-LABEL: cnt8: @@ -172,43 +174,43 @@ } define i64 @cnt64(i64 %x) nounwind readnone { -; X32-LABEL: cnt64: -; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl %ecx, %edx -; X32-NEXT: shrl %edx -; X32-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; X32-NEXT: subl %edx, %ecx -; X32-NEXT: movl %ecx, %edx -; X32-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X32-NEXT: shrl $2, %ecx -; X32-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: movl %ecx, %edx -; X32-NEXT: shrl $4, %edx -; X32-NEXT: addl %ecx, %edx -; X32-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; X32-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101 -; X32-NEXT: shrl $24, %ecx -; X32-NEXT: movl %eax, %edx -; X32-NEXT: shrl %edx -; X32-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; X32-NEXT: subl %edx, %eax -; X32-NEXT: movl %eax, %edx -; X32-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X32-NEXT: shrl $2, %eax -; X32-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X32-NEXT: addl %edx, %eax -; X32-NEXT: movl %eax, %edx -; X32-NEXT: shrl $4, %edx -; X32-NEXT: addl %eax, %edx -; X32-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; X32-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 -; X32-NEXT: shrl $24, %eax -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: xorl %edx, %edx -; X32-NEXT: retl +; X32-NOSSE-LABEL: cnt64: +; X32-NOSSE: # %bb.0: +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NOSSE-NEXT: movl %ecx, %edx +; X32-NOSSE-NEXT: shrl %edx +; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X32-NOSSE-NEXT: subl %edx, %ecx +; X32-NOSSE-NEXT: movl %ecx, %edx +; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X32-NOSSE-NEXT: shrl $2, %ecx +; X32-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X32-NOSSE-NEXT: addl %edx, %ecx +; X32-NOSSE-NEXT: movl %ecx, %edx +; X32-NOSSE-NEXT: shrl $4, %edx +; X32-NOSSE-NEXT: addl %ecx, %edx +; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X32-NOSSE-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101 +; X32-NOSSE-NEXT: shrl $24, %ecx +; X32-NOSSE-NEXT: movl %eax, %edx +; X32-NOSSE-NEXT: shrl %edx +; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X32-NOSSE-NEXT: subl %edx, %eax +; X32-NOSSE-NEXT: movl %eax, %edx +; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X32-NOSSE-NEXT: shrl $2, %eax +; X32-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X32-NOSSE-NEXT: addl %edx, %eax +; X32-NOSSE-NEXT: movl %eax, %edx +; X32-NOSSE-NEXT: shrl $4, %edx +; X32-NOSSE-NEXT: addl %eax, %edx +; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X32-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X32-NOSSE-NEXT: shrl $24, %eax +; X32-NOSSE-NEXT: addl %ecx, %eax +; X32-NOSSE-NEXT: xorl %edx, %edx +; X32-NOSSE-NEXT: retl ; ; X64-LABEL: cnt64: ; X64: # %bb.0: @@ -245,6 +247,48 @@ ; X64-POPCNT: # %bb.0: ; X64-POPCNT-NEXT: popcntq %rdi, %rax ; X64-POPCNT-NEXT: retq +; +; X32-SSE2-LABEL: cnt64: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrlw $1, %xmm1 +; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE2-NEXT: psubb %xmm1, %xmm0 +; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X32-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X32-SSE2-NEXT: pand %xmm1, %xmm2 +; X32-SSE2-NEXT: psrlw $2, %xmm0 +; X32-SSE2-NEXT: pand %xmm1, %xmm0 +; X32-SSE2-NEXT: paddb %xmm2, %xmm0 +; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE2-NEXT: psrlw $4, %xmm1 +; X32-SSE2-NEXT: paddb %xmm0, %xmm1 +; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1 +; X32-SSE2-NEXT: pxor %xmm0, %xmm0 +; X32-SSE2-NEXT: psadbw %xmm1, %xmm0 +; X32-SSE2-NEXT: movd %xmm0, %eax +; X32-SSE2-NEXT: xorl %edx, %edx +; X32-SSE2-NEXT: retl +; +; X32-SSSE3-LABEL: cnt64: +; X32-SSSE3: # %bb.0: +; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2 +; X32-SSSE3-NEXT: pand %xmm0, %xmm2 +; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4 +; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4 +; X32-SSSE3-NEXT: psrlw $4, %xmm1 +; X32-SSSE3-NEXT: pand %xmm0, %xmm1 +; X32-SSSE3-NEXT: pshufb %xmm1, %xmm3 +; X32-SSSE3-NEXT: paddb %xmm4, %xmm3 +; X32-SSSE3-NEXT: pxor %xmm0, %xmm0 +; X32-SSSE3-NEXT: psadbw %xmm3, %xmm0 +; X32-SSSE3-NEXT: movd %xmm0, %eax +; X32-SSSE3-NEXT: xorl %edx, %edx +; X32-SSSE3-NEXT: retl %cnt = tail call i64 @llvm.ctpop.i64(i64 %x) ret i64 %cnt }