This is an archive of the discontinued LLVM Phabricator instance.

[X86] Limit the number of target specific nodes emitted in LowerShiftParts
ClosedPublic

Authored by craig.topper on Jun 26 2018, 5:16 PM.

Download Raw Diff

Details

Reviewers

spatel
RKSimon

Commits

rG87b107dd698f: [X86] Limit the number of target specific nodes emitted in LowerShiftParts
rL335998: [X86] Limit the number of target specific nodes emitted in LowerShiftParts

Summary

The important part is the creation of the SHLD/SHRD nodes. The compare and the conditional move can use target-independent nodes that can be legalized on their own. This gives some opportunities to trigger the optimizations present in the lowering for those things. And it's just better to limit the number of places we emit target-specific nodes.

The changed test cases still aren't optimal.

Diff Detail

Event Timeline

craig.topper created this revision. — Jun 26 2018, 5:16 PM

RKSimon added inline comments. — Jun 28 2018, 8:31 AM

test/CodeGen/X86/legalize-shift-64.ll
146	?

craig.topper added inline comments. — Jun 28 2018, 12:23 PM

test/CodeGen/X86/legalize-shift-64.ll
146	I didn't look closely at it. I'm assuming we're missing constant folding on X86ISD::OR. Notice that on the LHS side we do the equally dumb `movb $32, %dl; testb %dl, %dl; jne`.

LGTM

test/CodeGen/X86/legalize-shift-64.ll
146	I've raised PR37987 to track this.

This revision is now accepted and ready to land. — Jun 29 2018, 5:31 AM

Closed by commit rL335998: [X86] Limit the number of target specific nodes emitted in LowerShiftParts (authored by ctopper). · Explain Why — Jun 29 2018, 10:28 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelLowering.cpp

19 lines

test/

CodeGen/

X86/

legalize-shift-64.ll

25 lines

pr32282.ll

39 lines

Diff 152992

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 16,122 Lines • ▼ Show 20 Lines	if (Op.getOpcode() == ISD::SHL_PARTS) {
Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);		Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
}		}

// If the shift amount is larger or equal than the width of a part we can't		// If the shift amount is larger or equal than the width of a part we can't
// rely on the results of shld/shrd. Insert a test and select the appropriate		// rely on the results of shld/shrd. Insert a test and select the appropriate
// values for large shift amounts.		// values for large shift amounts.
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,		SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i8));		DAG.getConstant(VTBits, dl, MVT::i8));
SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,		SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
AndNode, DAG.getConstant(0, dl, MVT::i8));		DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);

SDValue Hi, Lo;		SDValue Hi, Lo;
SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

if (Op.getOpcode() == ISD::SHL_PARTS) {		if (Op.getOpcode() == ISD::SHL_PARTS) {
Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);		Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);		Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
} else {		} else {
Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);		Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);		Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
}		}

SDValue Ops[2] = { Lo, Hi };		return DAG.getMergeValues({ Lo, Hi }, dl);
return DAG.getMergeValues(Ops, dl);
}		}

// Try to use a packed vector operation to handle i64 on 32-bit targets when		// Try to use a packed vector operation to handle i64 on 32-bit targets when
// AVX512DQ is enabled.		// AVX512DQ is enabled.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,		static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
assert((Op.getOpcode() == ISD::SINT_TO_FP \|\|		assert((Op.getOpcode() == ISD::SINT_TO_FP \|\|
Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");		Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
▲ Show 20 Lines • Show All 24,292 Lines • Show Last 20 Lines

test/CodeGen/X86/legalize-shift-64.ll

	Show First 20 Lines • Show All 136 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: .cfi_def_cfa_offset 8			; CHECK-NEXT: .cfi_def_cfa_offset 8
	; CHECK-NEXT: .cfi_offset %ebp, -8			; CHECK-NEXT: .cfi_offset %ebp, -8
	; CHECK-NEXT: movl %esp, %ebp			; CHECK-NEXT: movl %esp, %ebp
	; CHECK-NEXT: .cfi_def_cfa_register %ebp			; CHECK-NEXT: .cfi_def_cfa_register %ebp
	; CHECK-NEXT: andl $-8, %esp			; CHECK-NEXT: andl $-8, %esp
	; CHECK-NEXT: subl $16, %esp			; CHECK-NEXT: subl $16, %esp
	; CHECK-NEXT: movl $1, {{[0-9]+}}(%esp)			; CHECK-NEXT: movl $1, {{[0-9]+}}(%esp)
	; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax			; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: xorl %eax, %eax
				; CHECK-NEXT: orl $0, %eax
				RKSimonUnsubmitted Not Done Reply Inline Actions ? RKSimon: ?
				craig.topperAuthorUnsubmitted Not Done Reply Inline Actions I didn't look closely at it. I'm assuming we're missing constant folding on X86ISD::OR. Notice in the LHS side we do the equally dumb movb $32, %dl testb %dl, %dl jne craig.topper: I didn't look closely at it. I'm assuming we're missing constant folding on X86ISD::OR. Notice…
				RKSimonUnsubmitted Not Done Reply Inline Actions I've raised PR37987 to track this. RKSimon: I've raised PR37987 to track this.
				; CHECK-NEXT: je .LBB5_3
				; CHECK-NEXT: # %bb.1: # %if.then
	; CHECK-NEXT: movl $1, %eax			; CHECK-NEXT: movl $1, %eax
	; CHECK-NEXT: xorl %ecx, %ecx			; CHECK-NEXT: jmp .LBB5_2
	; CHECK-NEXT: shldl $32, %eax, %ecx			; CHECK-NEXT: .LBB5_3: # %if.end
	; CHECK-NEXT: movb $32, %dl
	; CHECK-NEXT: testb %dl, %dl
	; CHECK-NEXT: jne .LBB5_2
	; CHECK-NEXT: # %bb.1:
	; CHECK-NEXT: movl %ecx, %eax
	; CHECK-NEXT: .LBB5_2:
	; CHECK-NEXT: sete %cl
	; CHECK-NEXT: movzbl %cl, %ecx
	; CHECK-NEXT: xorl $1, %eax
	; CHECK-NEXT: orl %ecx, %eax
	; CHECK-NEXT: je .LBB5_5
	; CHECK-NEXT: # %bb.3: # %if.then
	; CHECK-NEXT: movl $1, %eax
	; CHECK-NEXT: jmp .LBB5_4
	; CHECK-NEXT: .LBB5_5: # %if.end
	; CHECK-NEXT: xorl %eax, %eax			; CHECK-NEXT: xorl %eax, %eax
	; CHECK-NEXT: .LBB5_4: # %if.then			; CHECK-NEXT: .LBB5_2: # %if.then
	; CHECK-NEXT: movl %ebp, %esp			; CHECK-NEXT: movl %ebp, %esp
	; CHECK-NEXT: popl %ebp			; CHECK-NEXT: popl %ebp
	; CHECK-NEXT: .cfi_def_cfa %esp, 4			; CHECK-NEXT: .cfi_def_cfa %esp, 4
	; CHECK-NEXT: retl			; CHECK-NEXT: retl
	%x = alloca i32, align 4			%x = alloca i32, align 4
	%t = alloca i64, align 8			%t = alloca i64, align 8
	store volatile i32 1, i32* %x, align 4			store volatile i32 1, i32* %x, align 4
	%load = load volatile i32, i32* %x, align 4			%load = load volatile i32, i32* %x, align 4
	Show All 14 Lines

test/CodeGen/X86/pr32282.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=skx \| FileCheck %s --check-prefix=X86			; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=skx \| FileCheck %s --check-prefix=X86
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx \| FileCheck %s --check-prefix=X64			; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx \| FileCheck %s --check-prefix=X64

	; Check for assert in foldMaskAndShiftToScale due to out of range mask scaling.			; Check for assert in foldMaskAndShiftToScale due to out of range mask scaling.

	@b = common global i8 zeroinitializer, align 1			@b = common global i8 zeroinitializer, align 1
	@c = common global i8 zeroinitializer, align 1			@c = common global i8 zeroinitializer, align 1
	@d = common global i64 zeroinitializer, align 8			@d = common global i64 zeroinitializer, align 8
	@e = common global i64 zeroinitializer, align 8			@e = common global i64 zeroinitializer, align 8

	define void @foo() {			define void @foo() {
	; X86-LABEL: foo:			; X86-LABEL: foo:
	; X86: # %bb.0:			; X86: # %bb.0:
	; X86-NEXT: pushl %esi
	; X86-NEXT: .cfi_def_cfa_offset 8
	; X86-NEXT: pushl %eax			; X86-NEXT: pushl %eax
	; X86-NEXT: .cfi_def_cfa_offset 12			; X86-NEXT: .cfi_def_cfa_offset 8
	; X86-NEXT: .cfi_offset %esi, -8			; X86-NEXT: movl d, %eax
	; X86-NEXT: movl d, %ecx			; X86-NEXT: notl %eax
				; X86-NEXT: movl d+4, %ecx
	; X86-NEXT: notl %ecx			; X86-NEXT: notl %ecx
	; X86-NEXT: movl d+4, %edx			; X86-NEXT: andl $701685459, %ecx # imm = 0x29D2DED3
	; X86-NEXT: notl %edx			; X86-NEXT: andl $-564453154, %eax # imm = 0xDE5B20DE
	; X86-NEXT: andl $701685459, %edx # imm = 0x29D2DED3			; X86-NEXT: shrdl $21, %ecx, %eax
	; X86-NEXT: andl $-564453154, %ecx # imm = 0xDE5B20DE			; X86-NEXT: shrl $21, %ecx
	; X86-NEXT: shrdl $21, %edx, %ecx			; X86-NEXT: andl $-2, %eax
	; X86-NEXT: shrl $21, %edx			; X86-NEXT: xorl %edx, %edx
	; X86-NEXT: xorl %eax, %eax			; X86-NEXT: addl $7, %eax
	; X86-NEXT: testb %al, %al			; X86-NEXT: adcxl %edx, %ecx
	; X86-NEXT: movl %edx, %esi			; X86-NEXT: pushl %ecx
	; X86-NEXT: cmovnel %eax, %esi
	; X86-NEXT: cmovel %ecx, %edx
	; X86-NEXT: andl $-2, %edx
	; X86-NEXT: addl $7, %edx
	; X86-NEXT: adcxl %eax, %esi
	; X86-NEXT: pushl %esi
	; X86-NEXT: .cfi_adjust_cfa_offset 4			; X86-NEXT: .cfi_adjust_cfa_offset 4
	; X86-NEXT: pushl %edx			; X86-NEXT: pushl %eax
	; X86-NEXT: .cfi_adjust_cfa_offset 4			; X86-NEXT: .cfi_adjust_cfa_offset 4
	; X86-NEXT: pushl $0			; X86-NEXT: pushl $0
	; X86-NEXT: .cfi_adjust_cfa_offset 4			; X86-NEXT: .cfi_adjust_cfa_offset 4
	; X86-NEXT: pushl $0			; X86-NEXT: pushl $0
	; X86-NEXT: .cfi_adjust_cfa_offset 4			; X86-NEXT: .cfi_adjust_cfa_offset 4
	; X86-NEXT: calll __divdi3			; X86-NEXT: calll __divdi3
	; X86-NEXT: addl $16, %esp			; X86-NEXT: addl $16, %esp
	; X86-NEXT: .cfi_adjust_cfa_offset -16			; X86-NEXT: .cfi_adjust_cfa_offset -16
	; X86-NEXT: orl %eax, %edx			; X86-NEXT: orl %eax, %edx
	; X86-NEXT: setne {{[0-9]+}}(%esp)			; X86-NEXT: setne {{[0-9]+}}(%esp)
	; X86-NEXT: addl $4, %esp			; X86-NEXT: popl %eax
	; X86-NEXT: .cfi_def_cfa_offset 8
	; X86-NEXT: popl %esi
	; X86-NEXT: .cfi_def_cfa_offset 4			; X86-NEXT: .cfi_def_cfa_offset 4
	; X86-NEXT: retl			; X86-NEXT: retl
	;			;
	; X64-LABEL: foo:			; X64-LABEL: foo:
	; X64: # %bb.0:			; X64: # %bb.0:
	; X64-NEXT: movq {{.*}}(%rip), %rax			; X64-NEXT: movq {{.*}}(%rip), %rax
	; X64-NEXT: movabsq $3013716102212485120, %rcx # imm = 0x29D2DED3DE400000			; X64-NEXT: movabsq $3013716102212485120, %rcx # imm = 0x29D2DED3DE400000
	; X64-NEXT: andnq %rcx, %rax, %rcx			; X64-NEXT: andnq %rcx, %rax, %rcx
	▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines