This is an archive of the discontinued LLVM Phabricator instance.

[X86] Prefer to form negate instructions instead of folding a load
Changes Planned · Public

Authored by craig.topper on Jun 29 2018, 11:35 AM.

Download Raw Diff

Details

Reviewers

spatel
RKSimon

Summary

Previously, when given (sub 0, load), we preferred to fold the load and materialize the 0 in a register. I think we should instead use negate and do the load as a separate instruction.

Diff Detail

Event Timeline

craig.topper created this revision. Jun 29 2018, 11:35 AM

Have you got any numbers to back this up?

I'll run our benchmark list. This was more of an observation that we were different from icc, gcc, and msvc.

There's a related question: given the option of promoting a (i16 sub 0, load) to 32 bits, should we promote and use neg+movzwl, or keep it as 16 bits so we can fold the load?

craig.topper planned changes to this revision. Jun 29 2018, 2:17 PM

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelDAGToDAG.cpp

6 lines

test/

CodeGen/

X86/

2008-09-11-CoalescerBug2.ll

4 lines

2010-08-04-MaskedSignedCompare.ll

4 lines

add.ll

8 lines

imul.ll

8 lines

merge-consecutive-stores.ll

7 lines

subcarry.ll

12 lines

Diff 153547

lib/Target/X86/X86ISelDAGToDAG.cpp

Show First 20 Lines • Show All 516 Lines • ▼ Show 20 Lines	if (U == Root) {
switch (U->getOpcode()) {		switch (U->getOpcode()) {
default: break;		default: break;
case X86ISD::ADD:		case X86ISD::ADD:
case X86ISD::SUB:		case X86ISD::SUB:
case X86ISD::AND:		case X86ISD::AND:
case X86ISD::XOR:		case X86ISD::XOR:
case X86ISD::OR:		case X86ISD::OR:
case ISD::ADD:		case ISD::ADD:
		case ISD::SUB:
case ISD::ADDCARRY:		case ISD::ADDCARRY:
case ISD::AND:		case ISD::AND:
case ISD::OR:		case ISD::OR:
case ISD::XOR: {		case ISD::XOR: {
SDValue Op1 = U->getOperand(1);		SDValue Op1 = U->getOperand(1);

// If the other operand is a 8-bit immediate we should fold the immediate		// If the other operand is a 8-bit immediate we should fold the immediate
// instead. This reduces code size.		// instead. This reduces code size.
Show All 31 Lines	case ISD::XOR: {
// a load.		// a load.
// FIXME: This is probably also true for non-TLS addresses.		// FIXME: This is probably also true for non-TLS addresses.
if (Op1.getOpcode() == X86ISD::Wrapper) {		if (Op1.getOpcode() == X86ISD::Wrapper) {
SDValue Val = Op1.getOperand(0);		SDValue Val = Op1.getOperand(0);
if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)		if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
return false;		return false;
}		}

		// Prefer NEG over folding the load.
		if ((U->getOpcode() == ISD::SUB \|\| U->getOpcode() == X86ISD::SUB) &&
		isNullConstant(U->getOperand(0)))
		return false;

// Don't fold load if this matches the BTS/BTR/BTC patterns.		// Don't fold load if this matches the BTS/BTR/BTC patterns.
// BTS: (or X, (shl 1, n))		// BTS: (or X, (shl 1, n))
// BTR: (and X, (rotl -2, n))		// BTR: (and X, (rotl -2, n))
// BTC: (xor X, (shl 1, n))		// BTC: (xor X, (shl 1, n))
if (U->getOpcode() == ISD::OR \|\| U->getOpcode() == ISD::XOR) {		if (U->getOpcode() == ISD::OR \|\| U->getOpcode() == ISD::XOR) {
if (U->getOperand(0).getOpcode() == ISD::SHL &&		if (U->getOperand(0).getOpcode() == ISD::SHL &&
isOneConstant(U->getOperand(0).getOperand(0)))		isOneConstant(U->getOperand(0).getOperand(0)))
return false;		return false;
▲ Show 20 Lines • Show All 2,834 Lines • Show Last 20 Lines

test/CodeGen/X86/2008-09-11-CoalescerBug2.ll

	; RUN: llc < %s -mtriple=i686--			; RUN: llc < %s -mtriple=i686--
	; RUN: llc -pre-RA-sched=source < %s -mtriple=i686-unknown-linux -mcpu=corei7 \| FileCheck %s --check-prefix=SOURCE-SCHED			; RUN: llc -pre-RA-sched=source < %s -mtriple=i686-unknown-linux -mcpu=corei7 \| FileCheck %s --check-prefix=SOURCE-SCHED
	; PR2748			; PR2748

	@g_73 = external global i32 ; <i32*> [#uses=1]			@g_73 = external global i32 ; <i32*> [#uses=1]
	@g_5 = external global i32 ; <i32*> [#uses=1]			@g_5 = external global i32 ; <i32*> [#uses=1]

	define i32 @func_44(i16 signext %p_46) nounwind {			define i32 @func_44(i16 signext %p_46) nounwind {
	entry:			entry:
	; SOURCE-SCHED: subl			; SOURCE-SCHED: subl
	; SOURCE-SCHED: movl			; SOURCE-SCHED: movl
	; SOURCE-SCHED: sarl			; SOURCE-SCHED: sarl
	; SOURCE-SCHED: xorl			; SOURCE-SCHED: xorl
	; SOURCE-SCHED: cmpl			; SOURCE-SCHED: cmpl
	; SOURCE-SCHED: setg			; SOURCE-SCHED: setg
	; SOURCE-SCHED: movb			; SOURCE-SCHED: movb
	; SOURCE-SCHED: xorl			; SOURCE-SCHED: movl
	; SOURCE-SCHED: subl			; SOURCE-SCHED: negl
	; SOURCE-SCHED: testb			; SOURCE-SCHED: testb
	; SOURCE-SCHED: jne			; SOURCE-SCHED: jne
	%0 = load i32, i32* @g_5, align 4 ; <i32> [#uses=1]			%0 = load i32, i32* @g_5, align 4 ; <i32> [#uses=1]
	%1 = ashr i32 %0, 1 ; <i32> [#uses=1]			%1 = ashr i32 %0, 1 ; <i32> [#uses=1]
	%2 = icmp sgt i32 %1, 1 ; <i1> [#uses=1]			%2 = icmp sgt i32 %1, 1 ; <i1> [#uses=1]
	%3 = zext i1 %2 to i32 ; <i32> [#uses=1]			%3 = zext i1 %2 to i32 ; <i32> [#uses=1]
	%4 = load i32, i32* @g_73, align 4 ; <i32> [#uses=1]			%4 = load i32, i32* @g_73, align 4 ; <i32> [#uses=1]
	%5 = zext i16 %p_46 to i64 ; <i64> [#uses=1]			%5 = zext i16 %p_46 to i64 ; <i64> [#uses=1]
	Show All 19 Lines

test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown \| FileCheck %s			; RUN: llc < %s -mtriple=x86_64-unknown-unknown \| FileCheck %s
	; PR7814			; PR7814

	@g_16 = global i64 -3738643449681751625, align 8			@g_16 = global i64 -3738643449681751625, align 8
	@g_38 = global i32 0, align 4			@g_38 = global i32 0, align 4
	@.str = private constant [4 x i8] c"%d\0A\00"			@.str = private constant [4 x i8] c"%d\0A\00"

	define i32 @main() nounwind {			define i32 @main() nounwind {
	; CHECK-LABEL: main:			; CHECK-LABEL: main:
	; CHECK: # %bb.0: # %entry			; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: xorl %eax, %eax			; CHECK-NEXT: movq {{.*}}(%rip), %rax
	; CHECK-NEXT: cmpq {{.*}}(%rip), %rax			; CHECK-NEXT: negq %rax
	; CHECK-NEXT: sbbl %eax, %eax			; CHECK-NEXT: sbbl %eax, %eax
	; CHECK-NEXT: andl $150, %eax			; CHECK-NEXT: andl $150, %eax
	; CHECK-NEXT: testb %al, %al			; CHECK-NEXT: testb %al, %al
	; CHECK-NEXT: jle .LBB0_1			; CHECK-NEXT: jle .LBB0_1
	; CHECK-NEXT: # %bb.2: # %if.then			; CHECK-NEXT: # %bb.2: # %if.then
	; CHECK-NEXT: movl $1, {{.*}}(%rip)			; CHECK-NEXT: movl $1, {{.*}}(%rip)
	; CHECK-NEXT: movl $1, %esi			; CHECK-NEXT: movl $1, %esi
	; CHECK-NEXT: jmp .LBB0_3			; CHECK-NEXT: jmp .LBB0_3
	Show All 35 Lines

test/CodeGen/X86/add.ll

Show First 20 Lines • Show All 380 Lines • ▼ Show 20 Lines	entry:
%b = add i64 %aa, 128		%b = add i64 %aa, 128
store i64 %b, i64* %a		store i64 %b, i64* %a
ret void		ret void
}		}

define i32 @inc_not(i32 %a) {		define i32 @inc_not(i32 %a) {
; X32-LABEL: inc_not:		; X32-LABEL: inc_not:
; X32: # %bb.0:		; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax		; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: subl {{[0-9]+}}(%esp), %eax		; X32-NEXT: negl %eax
; X32-NEXT: retl		; X32-NEXT: retl
;		;
; X64-LINUX-LABEL: inc_not:		; X64-LINUX-LABEL: inc_not:
; X64-LINUX: # %bb.0:		; X64-LINUX: # %bb.0:
; X64-LINUX-NEXT: negl %edi		; X64-LINUX-NEXT: negl %edi
; X64-LINUX-NEXT: movl %edi, %eax		; X64-LINUX-NEXT: movl %edi, %eax
; X64-LINUX-NEXT: retq		; X64-LINUX-NEXT: retq
;		;
; X64-WIN32-LABEL: inc_not:		; X64-WIN32-LABEL: inc_not:
; X64-WIN32: # %bb.0:		; X64-WIN32: # %bb.0:
; X64-WIN32-NEXT: negl %ecx		; X64-WIN32-NEXT: negl %ecx
; X64-WIN32-NEXT: movl %ecx, %eax		; X64-WIN32-NEXT: movl %ecx, %eax
; X64-WIN32-NEXT: retq		; X64-WIN32-NEXT: retq
%nota = xor i32 %a, -1		%nota = xor i32 %a, -1
%r = add i32 %nota, 1		%r = add i32 %nota, 1
ret i32 %r		ret i32 %r
}		}

define void @uaddo1_not(i32 %a, i32* %p0, i1* %p1) {		define void @uaddo1_not(i32 %a, i32* %p0, i1* %p1) {
; X32-LABEL: uaddo1_not:		; X32-LABEL: uaddo1_not:
; X32: # %bb.0:		; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax		; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx		; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: xorl %edx, %edx		; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: subl {{[0-9]+}}(%esp), %edx		; X32-NEXT: negl %edx
; X32-NEXT: movl %edx, (%ecx)		; X32-NEXT: movl %edx, (%ecx)
; X32-NEXT: setae (%eax)		; X32-NEXT: setae (%eax)
; X32-NEXT: retl		; X32-NEXT: retl
;		;
; X64-LINUX-LABEL: uaddo1_not:		; X64-LINUX-LABEL: uaddo1_not:
; X64-LINUX: # %bb.0:		; X64-LINUX: # %bb.0:
; X64-LINUX-NEXT: negl %edi		; X64-LINUX-NEXT: negl %edi
; X64-LINUX-NEXT: movl %edi, (%rsi)		; X64-LINUX-NEXT: movl %edi, (%rsi)
Show All 17 Lines

test/CodeGen/X86/imul.ll

	Show First 20 Lines • Show All 268 Lines • ▼ Show 20 Lines
	; X64-LABEL: mul4294967295_32:			; X64-LABEL: mul4294967295_32:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: negl %edi			; X64-NEXT: negl %edi
	; X64-NEXT: movl %edi, %eax			; X64-NEXT: movl %edi, %eax
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: mul4294967295_32:			; X86-LABEL: mul4294967295_32:
	; X86: # %bb.0:			; X86: # %bb.0:
	; X86-NEXT: xorl %eax, %eax			; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: subl {{[0-9]+}}(%esp), %eax			; X86-NEXT: negl %eax
	; X86-NEXT: retl			; X86-NEXT: retl
	%mul = mul i32 %A, 4294967295			%mul = mul i32 %A, 4294967295
	ret i32 %mul			ret i32 %mul
	}			}

	define i64 @mul18446744073709551615_64(i64 %A) {			define i64 @mul18446744073709551615_64(i64 %A) {
	; X64-LABEL: mul18446744073709551615_64:			; X64-LABEL: mul18446744073709551615_64:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: negq %rdi			; X64-NEXT: negq %rdi
	; X64-NEXT: movq %rdi, %rax			; X64-NEXT: movq %rdi, %rax
	; X64-NEXT: retq			; X64-NEXT: retq
	;			;
	; X86-LABEL: mul18446744073709551615_64:			; X86-LABEL: mul18446744073709551615_64:
	; X86: # %bb.0:			; X86: # %bb.0:
				; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: xorl %edx, %edx			; X86-NEXT: xorl %edx, %edx
	; X86-NEXT: xorl %eax, %eax			; X86-NEXT: negl %eax
	; X86-NEXT: subl {{[0-9]+}}(%esp), %eax
	; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx			; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx
	; X86-NEXT: retl			; X86-NEXT: retl
	%mul = mul i64 %A, 18446744073709551615			%mul = mul i64 %A, 18446744073709551615
	ret i64 %mul			ret i64 %mul
	}			}

	define i32 @test(i32 %a) {			define i32 @test(i32 %a) {
	; X64-LABEL: test:			; X64-LABEL: test:
	▲ Show 20 Lines • Show All 224 Lines • Show Last 20 Lines

test/CodeGen/X86/merge-consecutive-stores.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=i686-unknown-unknown \| FileCheck %s			; RUN: llc < %s -mtriple=i686-unknown-unknown \| FileCheck %s

	; Make sure that we are zeroing one memory location at a time using xorl and			; Make sure that we are zeroing one memory location at a time using xorl and
	; not both using XMM registers.			; not both using XMM registers.

	define i32 @foo (i64* %so) nounwind uwtable ssp {			define i32 @foo (i64* %so) nounwind uwtable ssp {
	; CHECK-LABEL: foo:			; CHECK-LABEL: foo:
	; CHECK: # %bb.0:			; CHECK: # %bb.0:
	; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax			; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
	; CHECK-NEXT: movl $0, 28(%eax)			; CHECK-NEXT: movl $0, 28(%eax)
	; CHECK-NEXT: movl $0, 24(%eax)			; CHECK-NEXT: movl $0, 24(%eax)
	; CHECK-NEXT: xorl %ecx, %ecx			; CHECK-NEXT: movl 16(%eax), %ecx
	; CHECK-NEXT: cmpl 16(%eax), %ecx
	; CHECK-NEXT: movl $0, 16(%eax)			; CHECK-NEXT: movl $0, 16(%eax)
	; CHECK-NEXT: sbbl 20(%eax), %ecx			; CHECK-NEXT: xorl %edx, %edx
				; CHECK-NEXT: negl %ecx
				; CHECK-NEXT: sbbl 20(%eax), %edx
	; CHECK-NEXT: movl $0, 20(%eax)			; CHECK-NEXT: movl $0, 20(%eax)
	; CHECK-NEXT: setl %al			; CHECK-NEXT: setl %al
	; CHECK-NEXT: movzbl %al, %eax			; CHECK-NEXT: movzbl %al, %eax
	; CHECK-NEXT: negl %eax			; CHECK-NEXT: negl %eax
	; CHECK-NEXT: retl			; CHECK-NEXT: retl
	%used = getelementptr inbounds i64, i64* %so, i32 3			%used = getelementptr inbounds i64, i64* %so, i32 3
	store i64 0, i64* %used, align 8			store i64 0, i64* %used, align 8
	%fill = getelementptr inbounds i64, i64* %so, i32 2			%fill = getelementptr inbounds i64, i64* %so, i32 2
	%L = load i64, i64* %fill, align 8			%L = load i64, i64* %fill, align 8
	store i64 0, i64* %fill, align 8			store i64 0, i64* %fill, align 8
	%cmp28 = icmp sgt i64 %L, 0			%cmp28 = icmp sgt i64 %L, 0
	%R = sext i1 %cmp28 to i32			%R = sext i1 %cmp28 to i32
	ret i32 %R			ret i32 %R
	}			}

test/CodeGen/X86/subcarry.ll

Show All 31 Lines	entry:
ret i256 %0		ret i256 %0
}		}

%S = type { [4 x i64] }		%S = type { [4 x i64] }

define %S @negate(%S* nocapture readonly %this) {		define %S @negate(%S* nocapture readonly %this) {
; CHECK-LABEL: negate:		; CHECK-LABEL: negate:
; CHECK: # %bb.0: # %entry		; CHECK: # %bb.0: # %entry
		; CHECK-NEXT: movq (%rsi), %rax
; CHECK-NEXT: xorl %r8d, %r8d		; CHECK-NEXT: xorl %r8d, %r8d
; CHECK-NEXT: xorl %ecx, %ecx		; CHECK-NEXT: negq %rax
; CHECK-NEXT: subq (%rsi), %rcx
; CHECK-NEXT: movl $0, %edx		; CHECK-NEXT: movl $0, %edx
; CHECK-NEXT: sbbq 8(%rsi), %rdx		; CHECK-NEXT: sbbq 8(%rsi), %rdx
; CHECK-NEXT: movl $0, %eax		; CHECK-NEXT: movl $0, %ecx
; CHECK-NEXT: sbbq 16(%rsi), %rax		; CHECK-NEXT: sbbq 16(%rsi), %rcx
; CHECK-NEXT: sbbq 24(%rsi), %r8		; CHECK-NEXT: sbbq 24(%rsi), %r8
; CHECK-NEXT: movq %rcx, (%rdi)		; CHECK-NEXT: movq %rax, (%rdi)
; CHECK-NEXT: movq %rdx, 8(%rdi)		; CHECK-NEXT: movq %rdx, 8(%rdi)
; CHECK-NEXT: movq %rax, 16(%rdi)		; CHECK-NEXT: movq %rcx, 16(%rdi)
; CHECK-NEXT: movq %r8, 24(%rdi)		; CHECK-NEXT: movq %r8, 24(%rdi)
; CHECK-NEXT: movq %rdi, %rax		; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: retq		; CHECK-NEXT: retq
entry:		entry:
%0 = getelementptr inbounds %S, %S* %this, i64 0, i32 0, i64 0		%0 = getelementptr inbounds %S, %S* %this, i64 0, i32 0, i64 0
%1 = load i64, i64* %0, align 8		%1 = load i64, i64* %0, align 8
%2 = xor i64 %1, -1		%2 = xor i64 %1, -1
%3 = zext i64 %2 to i128		%3 = zext i64 %2 to i128
▲ Show 20 Lines • Show All 106 Lines • Show Last 20 Lines