This is an archive of the discontinued LLVM Phabricator instance.

Codegen: LICM Remove check for exactly 1 register def.
ClosedPublic

Authored by iteratee on Jun 16 2016, 1:46 PM.

Download Raw Diff

Details

Reviewers

qcolombet
atrick
sunfish

Summary

When considering whether to split an instruction with a memory operand
into an explicit load and a register-based instruction, we currently
check that the resulting instruction has exactly 1 def. This prevents 2
important LICM optimizations: compares with memory operands, and double
indirect calls. All the tests and the test-suite pass without the check.
My guess as to original intent is to limit the additional register pressure
created by the new instruction, but given that we only split out a single
register, it is already limited.

The licm-dominance test now checks actual memory loads for hoisting instead of
undef, and it tests compares.
hoist-invariant-load.ll now checks for 2 hoists: the intended hoist, and a bonus
hoist from calling a GOT-relative function in a loop.

Diff Detail

Event Timeline

iteratee updated this revision to Diff 61021.Jun 16 2016, 1:46 PM

iteratee retitled this revision from to Codegen: LICM Remove check for exactly 1 register def..

iteratee updated this object.

iteratee added a reviewer: sunfish.

iteratee set the repository for this revision to rL LLVM.

iteratee added a child revision: D20379: Codegen: Fix broken assumption in Tail Merge..

iteratee added subscribers: echristo, mcrosier, haicheng, llvm-commits.

Corrected the output of the non-hoisted loads.

iteratee added a reviewer: atrick.Jun 21 2016, 4:16 PM

iteratee added a reviewer: qcolombet.

Quentin, Andrew,

Functionally this is a one-line change. When splitting an instruction from a memory op to an explicit load and a register op, we were checking the number of register defs in the resulting register op, and bailing if it wasn't exactly one. I've thought carefully about it, and the check doesn't seem to be necessary, and the tests all pass, but I'd like someone more familiar with that code to look over it.

Thanks,
Kyle.

Frankly I don't understand what MID.getNumDefs() has to do with anything here. So your change looks ok to me. But I'm not clear on how the register form of the instruction ends up with multiple defs--I guess it already had multiple defs. Can you show an example of that in the form of machine instrs?

Sure. I should be able to find an example with multiple defs.

Andrew, there's a test with a multiple register-def instruction in D21627.
The existing tests now cover 0-def instructions, as the double-indirection for the call gets hoisted to a single indirection.

I couldn't put the test in this revision, because memory-refs were not being propagated for the folded load in the instruction in question.

iteratee added a child revision: D21627: Codegen: [X86] preserve memory refs for folded umul_lohi.Jun 22 2016, 3:52 PM

LGTM

This revision is now accepted and ready to land.Jun 23 2016, 2:18 PM

iteratee closed this revision.Jun 23 2016, 2:49 PM

iteratee removed a child revision: D20379: Codegen: Fix broken assumption in Tail Merge..

Revision Contents

Path

Size

lib/

CodeGen/

MachineLICM.cpp

1 line

test/

CodeGen/

X86/

hoist-invariant-load.ll

4 lines

licm-dominance.ll

61 lines

Diff 61044

lib/CodeGen/MachineLICM.cpp

Show First 20 Lines • Show All 1,165 Lines • ▼ Show 20 Lines	MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) {
unsigned LoadRegIndex;		unsigned LoadRegIndex;
unsigned NewOpc =		unsigned NewOpc =
TII->getOpcodeAfterMemoryUnfold(MI->getOpcode(),		TII->getOpcodeAfterMemoryUnfold(MI->getOpcode(),
/*UnfoldLoad=*/true,		/*UnfoldLoad=*/true,
/*UnfoldStore=*/false,		/*UnfoldStore=*/false,
&LoadRegIndex);		&LoadRegIndex);
if (NewOpc == 0) return nullptr;		if (NewOpc == 0) return nullptr;
const MCInstrDesc &MID = TII->get(NewOpc);		const MCInstrDesc &MID = TII->get(NewOpc);
if (MID.getNumDefs() != 1) return nullptr;
MachineFunction &MF = *MI->getParent()->getParent();		MachineFunction &MF = *MI->getParent()->getParent();
const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI, MF);		const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI, MF);
// Ok, we're unfolding. Create a temporary register and do the unfold.		// Ok, we're unfolding. Create a temporary register and do the unfold.
unsigned Reg = MRI->createVirtualRegister(RC);		unsigned Reg = MRI->createVirtualRegister(RC);

SmallVector<MachineInstr *, 2> NewMIs;		SmallVector<MachineInstr *, 2> NewMIs;
bool Success =		bool Success =
TII->unfoldMemoryOperand(MF, MI, Reg,		TII->unfoldMemoryOperand(MF, MI, Reg,
▲ Show 20 Lines • Show All 209 Lines • Show Last 20 Lines

test/CodeGen/X86/hoist-invariant-load.ll

	; REQUIRES: asserts			; REQUIRES: asserts
	; RUN: llc < %s -stats -O2 2>&1 \| grep "1 machine-licm"			; RUN: llc < %s -stats -O2 2>&1 \| grep "2 machine-licm"
				; 2 invariant loads, 1 for OBJC_SELECTOR_REFERENCES_
				; and 1 for objc_msgSend from the GOT

	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"			target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
	target triple = "x86_64-apple-macosx10.7.2"			target triple = "x86_64-apple-macosx10.7.2"

	@"\01L_OBJC_METH_VAR_NAME_" = internal global [4 x i8] c"foo\00", section "__TEXT,__objc_methname,cstring_literals", align 1			@"\01L_OBJC_METH_VAR_NAME_" = internal global [4 x i8] c"foo\00", section "__TEXT,__objc_methname,cstring_literals", align 1
	@"\01L_OBJC_SELECTOR_REFERENCES_" = internal global i8* getelementptr inbounds ([4 x i8], [4 x i8]* @"\01L_OBJC_METH_VAR_NAME_", i64 0, i64 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"			@"\01L_OBJC_SELECTOR_REFERENCES_" = internal global i8* getelementptr inbounds ([4 x i8], [4 x i8]* @"\01L_OBJC_METH_VAR_NAME_", i64 0, i64 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip"
	@"\01L_OBJC_IMAGE_INFO" = internal constant [2 x i32] [i32 0, i32 16], section "__DATA, __objc_imageinfo, regular, no_dead_strip"			@"\01L_OBJC_IMAGE_INFO" = internal constant [2 x i32] [i32 0, i32 16], section "__DATA, __objc_imageinfo, regular, no_dead_strip"
	@llvm.used = appending global [3 x i8*] [i8* getelementptr inbounds ([4 x i8], [4 x i8]* @"\01L_OBJC_METH_VAR_NAME_", i32 0, i32 0), i8* bitcast (i8** @"\01L_OBJC_SELECTOR_REFERENCES_" to i8*), i8* bitcast ([2 x i32]* @"\01L_OBJC_IMAGE_INFO" to i8*)], section "llvm.metadata"			@llvm.used = appending global [3 x i8*] [i8* getelementptr inbounds ([4 x i8], [4 x i8]* @"\01L_OBJC_METH_VAR_NAME_", i32 0, i32 0), i8* bitcast (i8** @"\01L_OBJC_SELECTOR_REFERENCES_" to i8*), i8* bitcast ([2 x i32]* @"\01L_OBJC_IMAGE_INFO" to i8*)], section "llvm.metadata"
	Show All 20 Lines

test/CodeGen/X86/licm-dominance.ll

	; RUN: llc -asm-verbose=true < %s \| FileCheck %s			; RUN: llc -asm-verbose=true < %s \| FileCheck %s

	; MachineLICM should check dominance before hoisting instructions.			; MachineLICM should check dominance before hoisting instructions.
				; only the load of a0 is guaranteed to execute, so only it can be hoisted.
				; CHECK: movb (%rdi), [[a0reg:%[a-z0-9]+]]
				; CHECK: ## %for.body.i
				; CHECK: testb [[a0reg]], [[a0reg]]
	; CHECK: ## in Loop:			; CHECK: ## in Loop:
	; CHECK-NEXT: xorl %eax, %eax			; CHECK: cmpb $1
	; CHECK-NEXT: testb %al, %al			; CHECK: cmpb $2
				; CHECK: cmpb $3

	target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"			target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
	target triple = "x86_64-apple-macosx10.7.2"			target triple = "x86_64-apple-macosx10.7.2"

	define void @CMSColorWorldCreateParametricData() nounwind uwtable optsize ssp {			define void @CMSColorWorldCreateParametricData(
				i8* dereferenceable(1) %a0,
				i8* dereferenceable(1) %a1,
				i8* dereferenceable(1) %a2,
				i8* dereferenceable(1) %a3,
				i64 %count) nounwind uwtable optsize ssp readonly {
	entry:			entry:
	br label %for.body.i			br label %for.body.i

	for.body.i:			for.body.i:
	br i1 undef, label %for.inc.i, label %if.then26.i			%i = phi i64 [0, %entry], [%i.inc, %for.inc.i]
				%0 = load i8, i8* %a0, !invariant.load !0
				%cond0 = icmp eq i8 %0, 0
				br i1 %cond0, label %for.inc.i, label %if.then26.i

	if.then26.i:			if.then26.i:
	br i1 undef, label %if.else.i.i, label %lor.lhs.false.i.i			%1 = load i8, i8* %a1, !invariant.load !0
				%cond1 = icmp eq i8 %1, 1
				br i1 %cond1, label %if.else.i.i, label %lor.lhs.false.i.i

	if.else.i.i:			if.else.i.i:
	br i1 undef, label %lor.lhs.false.i.i, label %if.then116.i.i			%2 = load i8, i8* %a2, !invariant.load !0
				%cond2 = icmp eq i8 %2, 2
				br i1 %cond2, label %lor.lhs.false.i.i, label %for.inc.i

	lor.lhs.false.i.i:			lor.lhs.false.i.i:
	br i1 undef, label %for.inc.i, label %if.then116.i.i			%3 = load i8, i8* %a3, !invariant.load !0
				%cond3 = icmp eq i8 %3, 3
	if.then116.i.i:			br i1 %cond3, label %for.inc.i, label %if.end28.i
	unreachable

	for.inc.i:			for.inc.i:
	%cmp17.i = icmp ult i64 undef, undef			%i.inc = add nsw i64 %i, 1
				%cmp17.i = icmp ult i64 %i.inc, %count
	br i1 %cmp17.i, label %for.body.i, label %if.end28.i			br i1 %cmp17.i, label %for.body.i, label %if.end28.i

	if.end28.i:			if.end28.i:
	ret void			ret void
	}			}

				!0 = !{}