This is an archive of the discontinued LLVM Phabricator instance.

[X86]: Quit promoting 8 and 16 bit compares to 32 bit.
ClosedPublic

Authored by kbsmith1 on Jun 8 2016, 10:48 AM.

Download Raw Diff

Details

Reviewers

grosbach
echristo
sanjoy
DavidKreitzer

Commits

rL272801: [X86]: Quit promoting 8 and 16 bit compares to 32 bit.

Summary

This change effectively just reverts r195496 and updates the tests as needed.
8- and 16-bit compares are no longer promoted to 32-bit compares. This yields some
nice performance improvements, especially in eembc/rgbcmykv2. To keep this from
causing performance regressions in 401.bzip2, the changes in http://reviews.llvm.org/D21085
are also needed so that all the relevant movb and movw instructions get promoted to
movzbl/movzwl.

Diff Detail

Event Timeline

kbsmith1 updated this revision to Diff 60068.Jun 8 2016, 10:48 AM

kbsmith1 retitled this revision from to [X86]: Quit promoting 8 and 16 bit compares to 32 bit..

kbsmith1 updated this object.

kbsmith1 added reviewers: echristo, DavidKreitzer, sanjoy.

kbsmith1 added a subscriber: llvm-commits.

Herald added a subscriber: mehdi_amini. · View Herald TranscriptJun 8 2016, 10:48 AM

We talked about this on the list, but I'm getting an explicit ack from Jim as well.

-eric

echristo accepted this revision.Jun 8 2016, 1:27 PM

echristo edited edge metadata.

This revision is now accepted and ready to land.Jun 8 2016, 1:27 PM

eli.friedman added a subscriber: eli.friedman.Jun 8 2016, 2:20 PM

eli.friedman added inline comments.

test/CodeGen/X86/memcmp.ll
44	16-bit immediate operands are bad for performance on modern x86.

kbsmith1 added inline comments.Jun 8 2016, 3:48 PM

test/CodeGen/X86/memcmp.ll
44	I'm looking into changing the code so that 16 bit compares which have a constant operand will continue to get promoted, and how that affects the performance numbers.

Updated changes so this will continue to promote 16 bit compares to 32 bits if one of the compare
operands is a constant. This addresses Eli Friedman's comment.

Closed by commit rL272801: [X86]: Quit promoting 8 and 16 bit compares to 32 bit. (authored by kbsmith1). · Explain WhyJun 15 2016, 9:44 AM

This revision was automatically updated to reflect the committed changes.

In D21144#453975, @kbsmith1 wrote:

Updated changes so this will continue to promote 16 bit compares to 32 bits if one of the compare
operands is a constant. This addresses Eli Friedman's comment.

Yes, a 16-bit immediate constant may introduce a length-changing prefix (LCP) stall that can end up hurting performance. But on newer architectures (Sandy Bridge and later), this problem is much less severe, especially when the loop body fits in the loop stream detector (LSD). I don't think it's a good idea to blindly convert all comparisons against 16-bit immediate constants to 32-bit.

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelLowering.cpp

32 lines

test/

CodeGen/

X86/

8 lines

56 lines

48 lines

2 lines

machine-sink-and-implicit-null-checks.ll

2 lines

memcmp.ll

6 lines

pr5145.ll

16 lines

x86-shrink-wrapping.ll

3 lines

Diff 60068

lib/Target/X86/X86ISelLowering.cpp

Context not available.
	}	}
	}	}

	/// \brief Return true if the condition is an unsigned comparison operation.
	static bool isX86CCUnsigned(unsigned X86CC) {
	switch (X86CC) {
	default:
	llvm_unreachable("Invalid integer condition!");
	case X86::COND_E:
	case X86::COND_NE:
	case X86::COND_B:
	case X86::COND_A:
	case X86::COND_BE:
	case X86::COND_AE:
	return true;
	case X86::COND_G:
	case X86::COND_GE:
	case X86::COND_L:
	case X86::COND_LE:
	return false;
	}
	}

	static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {	static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
	switch (SetCCOpcode) {	switch (SetCCOpcode) {
	default: llvm_unreachable("Invalid integer condition!");	default: llvm_unreachable("Invalid integer condition!");
Context not available.

	if ((Op0.getValueType() == MVT::i8 \|\| Op0.getValueType() == MVT::i16 \|\|	if ((Op0.getValueType() == MVT::i8 \|\| Op0.getValueType() == MVT::i16 \|\|
	Op0.getValueType() == MVT::i32 \|\| Op0.getValueType() == MVT::i64)) {	Op0.getValueType() == MVT::i32 \|\| Op0.getValueType() == MVT::i64)) {
	// Do the comparison at i32 if it's smaller, besides the Atom case.
	// This avoids subregister aliasing issues. Keep the smaller reference
	// if we're optimizing for size, however, as that'll allow better folding
	// of memory operations.
	if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
	!DAG.getMachineFunction().getFunction()->optForMinSize() &&
	!Subtarget.isAtom()) {
	unsigned ExtendOp =
	isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
	Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
	Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
	}
	// Use SUB instead of CMP to enable CSE between SUB and CMP.	// Use SUB instead of CMP to enable CSE between SUB and CMP.
	SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);	SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
	SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,	SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
Context not available.

test/CodeGen/X86/3addr-16bit.ll

Context not available.

	; 64BIT-LABEL: t1:	; 64BIT-LABEL: t1:
	; 64BIT-NOT: movw %si, %ax	; 64BIT-NOT: movw %si, %ax
	; 64BIT: leal 1(%rsi), %eax	; 64BIT: movl %esi, %eax
	%0 = icmp eq i16 %k, %c ; <i1> [#uses=1]	%0 = icmp eq i16 %k, %c ; <i1> [#uses=1]
	%1 = add i16 %k, 1 ; <i16> [#uses=3]	%1 = add i16 %k, 1 ; <i16> [#uses=3]
	br i1 %0, label %bb, label %bb1	br i1 %0, label %bb, label %bb1
Context not available.

	; 64BIT-LABEL: t2:	; 64BIT-LABEL: t2:
	; 64BIT-NOT: movw %si, %ax	; 64BIT-NOT: movw %si, %ax
	; 64BIT: leal -1(%rsi), %eax	; 64BIT: movl %esi, %eax
	; 64BIT: movzwl %ax	; 64BIT: movzwl %ax
	%0 = icmp eq i16 %k, %c ; <i1> [#uses=1]	%0 = icmp eq i16 %k, %c ; <i1> [#uses=1]
	%1 = add i16 %k, -1 ; <i16> [#uses=3]	%1 = add i16 %k, -1 ; <i16> [#uses=3]
Context not available.

	; 64BIT-LABEL: t3:	; 64BIT-LABEL: t3:
	; 64BIT-NOT: movw %si, %ax	; 64BIT-NOT: movw %si, %ax
	; 64BIT: leal 2(%rsi), %eax	; 64BIT: movl %esi, %eax
	%0 = add i16 %k, 2 ; <i16> [#uses=3]	%0 = add i16 %k, 2 ; <i16> [#uses=3]
	%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]	%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]
	br i1 %1, label %bb, label %bb1	br i1 %1, label %bb, label %bb1
Context not available.

	; 64BIT-LABEL: t4:	; 64BIT-LABEL: t4:
	; 64BIT-NOT: movw %si, %ax	; 64BIT-NOT: movw %si, %ax
	; 64BIT: leal (%rsi,%rdi), %eax	; 64BIT: movl %esi, %eax
	%0 = add i16 %k, %c ; <i16> [#uses=3]	%0 = add i16 %k, %c ; <i16> [#uses=3]
	%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]	%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]
	br i1 %1, label %bb, label %bb1	br i1 %1, label %bb, label %bb1
Context not available.

test/CodeGen/X86/atomic16.ll

Context not available.
	}	}

	define void @atomic_fetch_max16(i16 %x) nounwind {	define void @atomic_fetch_max16(i16 %x) nounwind {
		; X64-LABEL: atomic_fetch_max16
		; X32-LABEL: atomic_fetch_max16
	%t1 = atomicrmw max i16* @sc16, i16 %x acquire	%t1 = atomicrmw max i16* @sc16, i16 %x acquire
	; X64: movswl	; X64: movw
	; X64: movswl	; X64: movw
	; X64: subl	; X64: subw
	; X64: cmov	; X64: cmov
	; X64: lock	; X64: lock
	; X64: cmpxchgw	; X64: cmpxchgw

	; X32: movswl	; X32: movw
	; X32: movswl	; X32: movw
	; X32: subl	; X32: subw
	; X32: cmov	; X32: cmov
	; X32: lock	; X32: lock
	; X32: cmpxchgw	; X32: cmpxchgw
Context not available.
	}	}

	define void @atomic_fetch_min16(i16 %x) nounwind {	define void @atomic_fetch_min16(i16 %x) nounwind {
		; X64-LABEL: atomic_fetch_min16
		; X32-LABEL: atomic_fetch_min16
	%t1 = atomicrmw min i16* @sc16, i16 %x acquire	%t1 = atomicrmw min i16* @sc16, i16 %x acquire
	; X64: movswl	; X64: movw
	; X64: movswl	; X64: movw
	; X64: subl	; X64: subw
	; X64: cmov	; X64: cmov
	; X64: lock	; X64: lock
	; X64: cmpxchgw	; X64: cmpxchgw

	; X32: movswl	; X32: movw
	; X32: movswl	; X32: movw
	; X32: subl	; X32: subw
	; X32: cmov	; X32: cmov
	; X32: lock	; X32: lock
	; X32: cmpxchgw	; X32: cmpxchgw
Context not available.
	}	}

	define void @atomic_fetch_umax16(i16 %x) nounwind {	define void @atomic_fetch_umax16(i16 %x) nounwind {
		; X64-LABEL: atomic_fetch_umax16
		; X32-LABEL: atomic_fetch_umax16
	%t1 = atomicrmw umax i16* @sc16, i16 %x acquire	%t1 = atomicrmw umax i16* @sc16, i16 %x acquire
	; X64: movzwl	; X64: movw
	; X64: movzwl	; X64: movw
	; X64: subl	; X64: subw
	; X64: cmov	; X64: cmov
	; X64: lock	; X64: lock
	; X64: cmpxchgw	; X64: cmpxchgw

	; X32: movzwl	; X32: movw
	; X32: movzwl	; X32: movw
	; X32: subl	; X32: subw
	; X32: cmov	; X32: cmov
	; X32: lock	; X32: lock
	; X32: cmpxchgw	; X32: cmpxchgw
Context not available.
	}	}

	define void @atomic_fetch_umin16(i16 %x) nounwind {	define void @atomic_fetch_umin16(i16 %x) nounwind {
		; X64-LABEL: atomic_fetch_umin16
		; X32-LABEL: atomic_fetch_umin16
	%t1 = atomicrmw umin i16* @sc16, i16 %x acquire	%t1 = atomicrmw umin i16* @sc16, i16 %x acquire
	; X64: movzwl	; X64: movw
	; X64: movzwl	; X64: movw
	; X64: subl	; X64: subw
	; X64: cmov	; X64: cmov
	; X64: lock	; X64: lock
	; X64: cmpxchgw	; X64: cmpxchgw

	; X32: movzwl	; X32: movw
	; X32: movzwl	; X32: movw
	; X32: subl	; X32: subw
	; X32: cmov	; X32: cmov
	; X32: lock	; X32: lock
	; X32: cmpxchgw	; X32: cmpxchgw
Context not available.

test/CodeGen/X86/atomic8.ll

Context not available.
	; X64-LABEL: atomic_fetch_max8:	; X64-LABEL: atomic_fetch_max8:
	; X32-LABEL: atomic_fetch_max8:	; X32-LABEL: atomic_fetch_max8:
	%t1 = atomicrmw max i8* @sc8, i8 %x acquire	%t1 = atomicrmw max i8* @sc8, i8 %x acquire
	; X64: movsbl	; X64: movb
	; X64: movsbl	; X64: movb
	; X64: subl	; X64: subb
	; X64: lock	; X64: lock
	; X64: cmpxchgb	; X64: cmpxchgb

	; X32: movsbl	; X32: movb
	; X32: movsbl	; X32: movb
	; X32: subl	; X32: subb
	; X32: lock	; X32: lock
	; X32: cmpxchgb	; X32: cmpxchgb
	ret void	ret void
Context not available.
	; X64-LABEL: atomic_fetch_min8:	; X64-LABEL: atomic_fetch_min8:
	; X32-LABEL: atomic_fetch_min8:	; X32-LABEL: atomic_fetch_min8:
	%t1 = atomicrmw min i8* @sc8, i8 %x acquire	%t1 = atomicrmw min i8* @sc8, i8 %x acquire
	; X64: movsbl	; X64: movb
	; X64: movsbl	; X64: movb
	; X64: subl	; X64: subb
	; X64: lock	; X64: lock
	; X64: cmpxchgb	; X64: cmpxchgb

	; X32: movsbl	; X32: movb
	; X32: movsbl	; X32: movb
	; X32: subl	; X32: subb
	; X32: lock	; X32: lock
	; X32: cmpxchgb	; X32: cmpxchgb
	ret void	ret void
Context not available.
	; X64-LABEL: atomic_fetch_umax8:	; X64-LABEL: atomic_fetch_umax8:
	; X32-LABEL: atomic_fetch_umax8:	; X32-LABEL: atomic_fetch_umax8:
	%t1 = atomicrmw umax i8* @sc8, i8 %x acquire	%t1 = atomicrmw umax i8* @sc8, i8 %x acquire
	; X64: movzbl	; X64: movb
	; X64: movzbl	; X64: movb
	; X64: subl	; X64: subb
	; X64: lock	; X64: lock
	; X64: cmpxchgb	; X64: cmpxchgb

	; X32: movzbl	; X32: movb
	; X32: movzbl	; X32: movb
	; X32: subl	; X32: subb
	; X32: lock	; X32: lock
	; X32: cmpxchgb	; X32: cmpxchgb
	ret void	ret void
Context not available.
	; X64-LABEL: atomic_fetch_umin8:	; X64-LABEL: atomic_fetch_umin8:
	; X32-LABEL: atomic_fetch_umin8:	; X32-LABEL: atomic_fetch_umin8:
	%t1 = atomicrmw umin i8* @sc8, i8 %x acquire	%t1 = atomicrmw umin i8* @sc8, i8 %x acquire
	; X64: movzbl	; X64: movb
	; X64: movzbl	; X64: movb
	; X64: subl	; X64: subb
	; X64: lock	; X64: lock
	; X64: cmpxchgb	; X64: cmpxchgb

	; X32: movzbl	; X32: movb
	; X32: movzbl	; X32: movb
	; X32: subl	; X32: subb
	; X32: lock	; X32: lock
	; X32: cmpxchgb	; X32: cmpxchgb
	ret void	ret void
Context not available.

test/CodeGen/X86/ctpop-combine.ll

Context not available.
	%conv = zext i1 %cmp to i32	%conv = zext i1 %cmp to i32
	ret i32 %conv	ret i32 %conv
	; CHECK-LABEL: test3:	; CHECK-LABEL: test3:
	; CHECK: cmpl $2	; CHECK: cmpb $2
	; CHECK: ret	; CHECK: ret
	}	}
Context not available.

test/CodeGen/X86/machine-sink-and-implicit-null-checks.ll

Context not available.
	; CHECK-NEXT: .byte 1	; CHECK-NEXT: .byte 1
	; CHECK-NEXT: .byte 0	; CHECK-NEXT: .byte 0
	; CHECK-NEXT: .short 0	; CHECK-NEXT: .short 0
	; CHECK-NEXT: .long 2	; CHECK-NEXT: .long 1

	; FunctionInfo[0] =	; FunctionInfo[0] =

Context not available.

test/CodeGen/X86/memcmp.ll

Context not available.
	ret void	ret void
	; CHECK-LABEL: memcmp2:	; CHECK-LABEL: memcmp2:
	; CHECK: movzwl	; CHECK: movzwl
	; CHECK-NEXT: movzwl	; CHECK-NEXT: cmpw
	; CHECK-NEXT: cmpl
	; NOBUILTIN-LABEL: memcmp2:	; NOBUILTIN-LABEL: memcmp2:
	; NOBUILTIN: callq	; NOBUILTIN: callq
	}	}
Context not available.
	return: ; preds = %entry	return: ; preds = %entry
	ret void	ret void
	; CHECK-LABEL: memcmp2a:	; CHECK-LABEL: memcmp2a:
	; CHECK: movzwl	; CHECK: cmpw $28527, (%
		eli.friedman (Unsubmitted, Not Done): 16-bit immediate operands are bad for performance on modern x86.
		kbsmith1 (Author; Unsubmitted, Not Done): I'm looking into changing the code so that 16 bit compares which have a constant operand will continue to get promoted, and how that affects the performance numbers.
	; CHECK-NEXT: cmpl $28527,
	}	}


Context not available.

test/CodeGen/X86/pr5145.ll

Context not available.
	; CHECK: atomic_maxmin_i8	; CHECK: atomic_maxmin_i8
	%1 = atomicrmw max i8* @sc8, i8 5 acquire	%1 = atomicrmw max i8* @sc8, i8 5 acquire
	; CHECK: [[LABEL1:\.?LBB[0-9]+_[0-9]+]]:	; CHECK: [[LABEL1:\.?LBB[0-9]+_[0-9]+]]:
	; CHECK: movsbl	; CHECK: cmpb
	; CHECK: cmpl	; CHECK: jg
	; CHECK: lock cmpxchgb	; CHECK: lock cmpxchgb
	; CHECK: jne [[LABEL1]]	; CHECK: jne [[LABEL1]]
	%2 = atomicrmw min i8* @sc8, i8 6 acquire	%2 = atomicrmw min i8* @sc8, i8 6 acquire
	; CHECK: [[LABEL3:\.?LBB[0-9]+_[0-9]+]]:	; CHECK: [[LABEL3:\.?LBB[0-9]+_[0-9]+]]:
	; CHECK: movsbl	; CHECK: cmpb
	; CHECK: cmpl	; CHECK: jl
	; CHECK: lock cmpxchgb	; CHECK: lock cmpxchgb
	; CHECK: jne [[LABEL3]]	; CHECK: jne [[LABEL3]]
	%3 = atomicrmw umax i8* @sc8, i8 7 acquire	%3 = atomicrmw umax i8* @sc8, i8 7 acquire
	; CHECK: [[LABEL5:\.?LBB[0-9]+_[0-9]+]]:	; CHECK: [[LABEL5:\.?LBB[0-9]+_[0-9]+]]:
	; CHECK: movzbl	; CHECK: cmpb
	; CHECK: cmpl	; CHECK: ja
	; CHECK: lock cmpxchgb	; CHECK: lock cmpxchgb
	; CHECK: jne [[LABEL5]]	; CHECK: jne [[LABEL5]]
	%4 = atomicrmw umin i8* @sc8, i8 8 acquire	%4 = atomicrmw umin i8* @sc8, i8 8 acquire
	; CHECK: [[LABEL7:\.?LBB[0-9]+_[0-9]+]]:	; CHECK: [[LABEL7:\.?LBB[0-9]+_[0-9]+]]:
	; CHECK: movzbl	; CHECK: cmpb
	; CHECK: cmpl	; CHECK: jb
	; CHECK: lock cmpxchgb	; CHECK: lock cmpxchgb
	; CHECK: jne [[LABEL7]]	; CHECK: jne [[LABEL7]]
	ret void	ret void
Context not available.

test/CodeGen/X86/x86-shrink-wrapping.ll

Context not available.
	; CHECK: testq %rdi, %rdi	; CHECK: testq %rdi, %rdi
	; CHECK-NEXT: je [[CLEANUP:LBB[0-9_]+]]	; CHECK-NEXT: je [[CLEANUP:LBB[0-9_]+]]
	;	;
	; CHECK: movzwl (%rdi), [[BF_LOAD:%e[a-z]+]]	; CHECK: cmpw $66, (%rdi)
	; CHECK-NEXT: cmpl $66, [[BF_LOAD]]
	; CHECK-NEXT: jne [[CLEANUP]]	; CHECK-NEXT: jne [[CLEANUP]]
	;	;
	; CHECK: movq 8(%rdi), %rdi	; CHECK: movq 8(%rdi), %rdi
Context not available.