This is an archive of the discontinued LLVM Phabricator instance.

[X86] Use xmm registers to implement 64-bit popcnt on 32-bit targets if possible if popcnt instruction is not available
ClosedPublic

Authored by craig.topper on Mar 21 2019, 12:13 PM.

Download Raw Diff

Details

Reviewers

spatel
RKSimon
andreadb

Commits

rGce1ed55a4a4a: [X86] Use xmm registers to implement 64-bit popcnt on 32-bit targets if…
rL356808: [X86] Use xmm registers to implement 64-bit popcnt on 32-bit targets if…

Summary

On 32-bit targets without popcnt, we currently expand 64-bit popcnt to sequences of arithmetic and logic ops for each 32-bit half and then add the 32-bit halves together. If we have xmm registers, we can use those to implement the operation instead. This results in fewer instructions than doing two separate 32-bit popcnt sequences.

Diff Detail

Repository: rL LLVM

Event Timeline

craig.topper created this revision. · Mar 21 2019, 12:13 PM

Herald added a project: Restricted Project. · View Herald Transcript · Mar 21 2019, 12:13 PM

Herald added a subscriber: hiraditya. · View Herald Transcript

craig.topper added a reviewer: andreadb. · Mar 21 2019, 1:00 PM

Make sure that NoImplicitFloat is not set before doing the transform.

LGTM

llvm/lib/Target/X86/X86ISelLowering.cpp
26715 ↗	(On Diff #191930)	Clearer to make this use the constant opcode getNode(ISD::CTPOP...) instead of using N->getOpcode() again.

This revision is now accepted and ready to land. · Mar 22 2019, 1:09 PM

craig.topper marked an inline comment as done. · Mar 22 2019, 1:31 PM

craig.topper added inline comments.

llvm/lib/Target/X86/X86ISelLowering.cpp
26715 ↗	(On Diff #191930)	Yeah, I'll change that. When I wrote it I was thinking we might want to do this for cttz and ctlz too, but their expansions still use bsr/bsf or lzcnt/tzcnt, so the vector version is probably worse.

Closed by commit rL356808: [X86] Use xmm registers to implement 64-bit popcnt on 32-bit targets if… (authored by ctopper). · Explain Why · Mar 22 2019, 1:46 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

X86/

X86ISelLowering.cpp

22 lines

test/

CodeGen/

X86/

popcnt.ll

132 lines

Diff 191943

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 408 Lines • ▼ Show 20 Lines	X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasPOPCNT()) {		if (Subtarget.hasPOPCNT()) {
setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);		setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
} else {		} else {
setOperationAction(ISD::CTPOP , MVT::i8 , Expand);		setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);		setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
setOperationAction(ISD::CTPOP , MVT::i32 , Expand);		setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
if (Subtarget.is64Bit())		if (Subtarget.is64Bit())
setOperationAction(ISD::CTPOP , MVT::i64 , Expand);		setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
		else
		setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
}		}

setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);		setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);

if (!Subtarget.hasMOVBE())		if (!Subtarget.hasMOVBE())
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);		setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

// These should be promoted to a larger select which is supported.		// These should be promoted to a larger select which is supported.
▲ Show 20 Lines • Show All 26,285 Lines • ▼ Show 20 Lines
/// custom code.		/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,		void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,		SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDLoc dl(N);		SDLoc dl(N);
switch (N->getOpcode()) {		switch (N->getOpcode()) {
default:		default:
llvm_unreachable("Do not know how to custom type legalize this operation!");		llvm_unreachable("Do not know how to custom type legalize this operation!");
		case ISD::CTPOP: {
		assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
		// Use a v2i64 if possible.
		bool NoImplicitFloatOps =
		DAG.getMachineFunction().getFunction().hasFnAttribute(
		Attribute::NoImplicitFloat);
		if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
		SDValue Wide =
		DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
		Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
		// Bit count should fit in 32-bits, extract it as that and then zero
		// extend to i64. Otherwise we end up extracting bits 63:32 separately.
		Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
		Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
		DAG.getIntPtrConstant(0, dl));
		Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
		Results.push_back(Wide);
		}
		return;
		}
case ISD::MUL: {		case ISD::MUL: {
EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);
assert(VT.isVector() && "Unexpected VT");		assert(VT.isVector() && "Unexpected VT");
if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger &&		if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger &&
VT.getVectorNumElements() == 2) {		VT.getVectorNumElements() == 2) {
// Promote to a pattern that will be turned into PMULUDQ.		// Promote to a pattern that will be turned into PMULUDQ.
SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,		SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
N->getOperand(0));		N->getOperand(0));
▲ Show 20 Lines • Show All 17,161 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/popcnt.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=i686-unknown \| FileCheck %s --check-prefix=X32			; RUN: llc < %s -mtriple=i686-unknown \| FileCheck %s --check-prefixes=X32,X32-NOSSE
	; RUN: llc < %s -mtriple=x86_64-unknown \| FileCheck %s --check-prefix=X64			; RUN: llc < %s -mtriple=x86_64-unknown \| FileCheck %s --check-prefix=X64
	; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt \| FileCheck %s --check-prefix=X32-POPCNT			; RUN: llc < %s -mtriple=i686-unknown -mattr=+popcnt \| FileCheck %s --check-prefix=X32-POPCNT
	; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt \| FileCheck %s --check-prefix=X64-POPCNT			; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+popcnt \| FileCheck %s --check-prefix=X64-POPCNT
				; RUN: llc < %s -mtriple=i686-unknown -mattr=sse2 \| FileCheck %s --check-prefixes=X32,X32-SSE2
				; RUN: llc < %s -mtriple=i686-unknown -mattr=ssse3 \| FileCheck %s --check-prefixes=X32,X32-SSSE3

	define i8 @cnt8(i8 %x) nounwind readnone {			define i8 @cnt8(i8 %x) nounwind readnone {
	; X32-LABEL: cnt8:			; X32-LABEL: cnt8:
	; X32: # %bb.0:			; X32: # %bb.0:
	; X32-NEXT: movb {{[0-9]+}}(%esp), %cl			; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
	; X32-NEXT: movl %ecx, %eax			; X32-NEXT: movl %ecx, %eax
	; X32-NEXT: shrb %al			; X32-NEXT: shrb %al
	; X32-NEXT: andb $85, %al			; X32-NEXT: andb $85, %al
	▲ Show 20 Lines • Show All 153 Lines • ▼ Show 20 Lines
	; X64-POPCNT: # %bb.0:			; X64-POPCNT: # %bb.0:
	; X64-POPCNT-NEXT: popcntl %edi, %eax			; X64-POPCNT-NEXT: popcntl %edi, %eax
	; X64-POPCNT-NEXT: retq			; X64-POPCNT-NEXT: retq
	%cnt = tail call i32 @llvm.ctpop.i32(i32 %x)			%cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
	ret i32 %cnt			ret i32 %cnt
	}			}

	define i64 @cnt64(i64 %x) nounwind readnone {			define i64 @cnt64(i64 %x) nounwind readnone {
	; X32-LABEL: cnt64:			; X32-NOSSE-LABEL: cnt64:
				; X32-NOSSE: # %bb.0:
				; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
				; X32-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; X32-NOSSE-NEXT: movl %ecx, %edx
				; X32-NOSSE-NEXT: shrl %edx
				; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555
				; X32-NOSSE-NEXT: subl %edx, %ecx
				; X32-NOSSE-NEXT: movl %ecx, %edx
				; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333
				; X32-NOSSE-NEXT: shrl $2, %ecx
				; X32-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333
				; X32-NOSSE-NEXT: addl %edx, %ecx
				; X32-NOSSE-NEXT: movl %ecx, %edx
				; X32-NOSSE-NEXT: shrl $4, %edx
				; X32-NOSSE-NEXT: addl %ecx, %edx
				; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
				; X32-NOSSE-NEXT: imull $16843009, %edx, %ecx # imm = 0x1010101
				; X32-NOSSE-NEXT: shrl $24, %ecx
				; X32-NOSSE-NEXT: movl %eax, %edx
				; X32-NOSSE-NEXT: shrl %edx
				; X32-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555
				; X32-NOSSE-NEXT: subl %edx, %eax
				; X32-NOSSE-NEXT: movl %eax, %edx
				; X32-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333
				; X32-NOSSE-NEXT: shrl $2, %eax
				; X32-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333
				; X32-NOSSE-NEXT: addl %edx, %eax
				; X32-NOSSE-NEXT: movl %eax, %edx
				; X32-NOSSE-NEXT: shrl $4, %edx
				; X32-NOSSE-NEXT: addl %eax, %edx
				; X32-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
				; X32-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101
				; X32-NOSSE-NEXT: shrl $24, %eax
				; X32-NOSSE-NEXT: addl %ecx, %eax
				; X32-NOSSE-NEXT: xorl %edx, %edx
				; X32-NOSSE-NEXT: retl
				;
				; X64-LABEL: cnt64:
				; X64: # %bb.0:
				; X64-NEXT: movq %rdi, %rax
				; X64-NEXT: shrq %rax
				; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
				; X64-NEXT: andq %rax, %rcx
				; X64-NEXT: subq %rcx, %rdi
				; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
				; X64-NEXT: movq %rdi, %rcx
				; X64-NEXT: andq %rax, %rcx
				; X64-NEXT: shrq $2, %rdi
				; X64-NEXT: andq %rax, %rdi
				; X64-NEXT: addq %rcx, %rdi
				; X64-NEXT: movq %rdi, %rax
				; X64-NEXT: shrq $4, %rax
				; X64-NEXT: leaq (%rax,%rdi), %rax
				; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
				; X64-NEXT: andq %rax, %rcx
				; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
				; X64-NEXT: imulq %rcx, %rax
				; X64-NEXT: shrq $56, %rax
				; X64-NEXT: retq
				;
				; X32-POPCNT-LABEL: cnt64:
				; X32-POPCNT: # %bb.0:
				; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
				; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
				; X32-POPCNT-NEXT: addl %ecx, %eax
				; X32-POPCNT-NEXT: xorl %edx, %edx
				; X32-POPCNT-NEXT: retl
				;
				; X64-POPCNT-LABEL: cnt64:
				; X64-POPCNT: # %bb.0:
				; X64-POPCNT-NEXT: popcntq %rdi, %rax
				; X64-POPCNT-NEXT: retq
				;
				; X32-SSE2-LABEL: cnt64:
				; X32-SSE2: # %bb.0:
				; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
				; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
				; X32-SSE2-NEXT: psrlw $1, %xmm1
				; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
				; X32-SSE2-NEXT: psubb %xmm1, %xmm0
				; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
				; X32-SSE2-NEXT: movdqa %xmm0, %xmm2
				; X32-SSE2-NEXT: pand %xmm1, %xmm2
				; X32-SSE2-NEXT: psrlw $2, %xmm0
				; X32-SSE2-NEXT: pand %xmm1, %xmm0
				; X32-SSE2-NEXT: paddb %xmm2, %xmm0
				; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
				; X32-SSE2-NEXT: psrlw $4, %xmm1
				; X32-SSE2-NEXT: paddb %xmm0, %xmm1
				; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm1
				; X32-SSE2-NEXT: pxor %xmm0, %xmm0
				; X32-SSE2-NEXT: psadbw %xmm1, %xmm0
				; X32-SSE2-NEXT: movd %xmm0, %eax
				; X32-SSE2-NEXT: xorl %edx, %edx
				; X32-SSE2-NEXT: retl
				;
				; X32-SSSE3-LABEL: cnt64:
				; X32-SSSE3: # %bb.0:
				; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
				; X32-SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
				; X32-SSSE3-NEXT: movdqa %xmm1, %xmm2
				; X32-SSSE3-NEXT: pand %xmm0, %xmm2
				; X32-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
				; X32-SSSE3-NEXT: movdqa %xmm3, %xmm4
				; X32-SSSE3-NEXT: pshufb %xmm2, %xmm4
				; X32-SSSE3-NEXT: psrlw $4, %xmm1
				; X32-SSSE3-NEXT: pand %xmm0, %xmm1
				; X32-SSSE3-NEXT: pshufb %xmm1, %xmm3
				; X32-SSSE3-NEXT: paddb %xmm4, %xmm3
				; X32-SSSE3-NEXT: pxor %xmm0, %xmm0
				; X32-SSSE3-NEXT: psadbw %xmm3, %xmm0
				; X32-SSSE3-NEXT: movd %xmm0, %eax
				; X32-SSSE3-NEXT: xorl %edx, %edx
				; X32-SSSE3-NEXT: retl
				%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
				ret i64 %cnt
				}

				define i64 @cnt64_noimplicitfloat(i64 %x) nounwind readnone noimplicitfloat {
				; X32-LABEL: cnt64_noimplicitfloat:
	; X32: # %bb.0:			; X32: # %bb.0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx			; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
	; X32-NEXT: movl %ecx, %edx			; X32-NEXT: movl %ecx, %edx
	; X32-NEXT: shrl %edx			; X32-NEXT: shrl %edx
	; X32-NEXT: andl $1431655765, %edx # imm = 0x55555555			; X32-NEXT: andl $1431655765, %edx # imm = 0x55555555
	; X32-NEXT: subl %edx, %ecx			; X32-NEXT: subl %edx, %ecx
	; X32-NEXT: movl %ecx, %edx			; X32-NEXT: movl %ecx, %edx
	Show All 21 Lines
	; X32-NEXT: addl %eax, %edx			; X32-NEXT: addl %eax, %edx
	; X32-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F			; X32-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
	; X32-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101			; X32-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101
	; X32-NEXT: shrl $24, %eax			; X32-NEXT: shrl $24, %eax
	; X32-NEXT: addl %ecx, %eax			; X32-NEXT: addl %ecx, %eax
	; X32-NEXT: xorl %edx, %edx			; X32-NEXT: xorl %edx, %edx
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: cnt64:			; X64-LABEL: cnt64_noimplicitfloat:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: movq %rdi, %rax			; X64-NEXT: movq %rdi, %rax
	; X64-NEXT: shrq %rax			; X64-NEXT: shrq %rax
	; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555			; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
	; X64-NEXT: andq %rax, %rcx			; X64-NEXT: andq %rax, %rcx
	; X64-NEXT: subq %rcx, %rdi			; X64-NEXT: subq %rcx, %rdi
	; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333			; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
	; X64-NEXT: movq %rdi, %rcx			; X64-NEXT: movq %rdi, %rcx
	; X64-NEXT: andq %rax, %rcx			; X64-NEXT: andq %rax, %rcx
	; X64-NEXT: shrq $2, %rdi			; X64-NEXT: shrq $2, %rdi
	; X64-NEXT: andq %rax, %rdi			; X64-NEXT: andq %rax, %rdi
	; X64-NEXT: addq %rcx, %rdi			; X64-NEXT: addq %rcx, %rdi
	; X64-NEXT: movq %rdi, %rax			; X64-NEXT: movq %rdi, %rax
	; X64-NEXT: shrq $4, %rax			; X64-NEXT: shrq $4, %rax
	; X64-NEXT: leaq (%rax,%rdi), %rax			; X64-NEXT: leaq (%rax,%rdi), %rax
	; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F			; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
	; X64-NEXT: andq %rax, %rcx			; X64-NEXT: andq %rax, %rcx
	; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101			; X64-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
	; X64-NEXT: imulq %rcx, %rax			; X64-NEXT: imulq %rcx, %rax
	; X64-NEXT: shrq $56, %rax			; X64-NEXT: shrq $56, %rax
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X32-POPCNT-LABEL: cnt64:			; X32-POPCNT-LABEL: cnt64_noimplicitfloat:
	; X32-POPCNT: # %bb.0:			; X32-POPCNT: # %bb.0:
	; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx			; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
	; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax			; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
	; X32-POPCNT-NEXT: addl %ecx, %eax			; X32-POPCNT-NEXT: addl %ecx, %eax
	; X32-POPCNT-NEXT: xorl %edx, %edx			; X32-POPCNT-NEXT: xorl %edx, %edx
	; X32-POPCNT-NEXT: retl			; X32-POPCNT-NEXT: retl
	;			;
	; X64-POPCNT-LABEL: cnt64:			; X64-POPCNT-LABEL: cnt64_noimplicitfloat:
	; X64-POPCNT: # %bb.0:			; X64-POPCNT: # %bb.0:
	; X64-POPCNT-NEXT: popcntq %rdi, %rax			; X64-POPCNT-NEXT: popcntq %rdi, %rax
	; X64-POPCNT-NEXT: retq			; X64-POPCNT-NEXT: retq
	%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)			%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
	ret i64 %cnt			ret i64 %cnt
	}			}

	declare i8 @llvm.ctpop.i8(i8) nounwind readnone			declare i8 @llvm.ctpop.i8(i8) nounwind readnone
	declare i16 @llvm.ctpop.i16(i16) nounwind readnone			declare i16 @llvm.ctpop.i16(i16) nounwind readnone
	declare i32 @llvm.ctpop.i32(i32) nounwind readnone			declare i32 @llvm.ctpop.i32(i32) nounwind readnone
	declare i64 @llvm.ctpop.i64(i64) nounwind readnone			declare i64 @llvm.ctpop.i64(i64) nounwind readnone