This is an archive of the discontinued LLVM Phabricator instance.

machine-instruction combiner uses too-simple algorithm to compute cost of one of the two alternatives, throws away combination results too often
ClosedPublic

Authored by Abe on Dec 7 2016, 3:35 PM.

Download Raw Diff

Details

Reviewers

spatel
sebpop
Gerolf

Summary

We have found that -- when the selected subarchitecture has a scheduling model and we are not optimizing for size -- the machine-instruction combiner uses a too-simple algorithm to compute the cost of one of the two alternatives [before and after running a combining pass on a section of code], and therefore it throws away the combination results too often.

This fix has the potential to help any ISA that supports instruction combining and for which at least one subarchitecture has a scheduling model.

As of now, this is only known to definitely affect AArch64 subarchitectures with a scheduling model.

Patch by Abe Skolnik and Sebastian Pop.

Regression tested on AMD64/GNU-Linux; the new test case was verified to fail on an unpatched compiler and pass on a patched compiler.

Diff Detail

Event Timeline

Abe updated this revision to Diff 80676.Dec 7 2016, 3:35 PM

Abe retitled this revision from to machine-instruction combiner uses too-simple algorithm to compute cost of one of the two alternatives, throws away combination results too often.

Abe updated this object.

Abe added reviewers: Gerolf, spatel.

Abe set the repository for this revision to rL LLVM.

Abe added a subscriber: sebpop.

Herald added a subscriber: aemerson. · View Herald TranscriptDec 7 2016, 3:35 PM

Abe updated this object.Dec 7 2016, 3:38 PM

Good catch. LGTM.

evandro added a subscriber: evandro.Dec 8 2016, 3:10 PM

Committed as r289399.

This revision is now accepted and ready to land.Dec 11 2016, 11:51 AM

sebpop closed this revision.Dec 11 2016, 11:52 AM

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

MachineCombiner.cpp

11 lines

test/

CodeGen/

AArch64/

arm64-fma-combines.ll

2 lines

machine-combiner_madd_during_address_computation.ll

66 lines

mul-lohi.ll

26 lines

Diff 80676

llvm/lib/CodeGen/MachineCombiner.cpp

Show First 20 Lines • Show All 65 Lines • ▼ Show 20 Lines	unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,		DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
MachineTraceMetrics::Trace BlockTrace);		MachineTraceMetrics::Trace BlockTrace);
unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,		unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
MachineTraceMetrics::Trace BlockTrace);		MachineTraceMetrics::Trace BlockTrace);
bool		bool
improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,		improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
MachineTraceMetrics::Trace BlockTrace,		MachineTraceMetrics::Trace BlockTrace,
SmallVectorImpl<MachineInstr *> &InsInstrs,		SmallVectorImpl<MachineInstr *> &InsInstrs,
		SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,		DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
MachineCombinerPattern Pattern);		MachineCombinerPattern Pattern);
bool preservesResourceLen(MachineBasicBlock *MBB,		bool preservesResourceLen(MachineBasicBlock *MBB,
MachineTraceMetrics::Trace BlockTrace,		MachineTraceMetrics::Trace BlockTrace,
SmallVectorImpl<MachineInstr *> &InsInstrs,		SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs);		SmallVectorImpl<MachineInstr *> &DelInstrs);
void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,		void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);		SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
▲ Show 20 Lines • Show All 155 Lines • ▼ Show 20 Lines
/// The new code sequence ends in MI NewRoot. A necessary condition for the new		/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical		/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the		/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).		/// new path improves the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(		bool MachineCombiner::improvesCriticalPathLen(
MachineBasicBlock *MBB, MachineInstr *Root,		MachineBasicBlock *MBB, MachineInstr *Root,
MachineTraceMetrics::Trace BlockTrace,		MachineTraceMetrics::Trace BlockTrace,
SmallVectorImpl<MachineInstr *> &InsInstrs,		SmallVectorImpl<MachineInstr *> &InsInstrs,
		SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,		DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
MachineCombinerPattern Pattern) {		MachineCombinerPattern Pattern) {
assert(TSchedModel.hasInstrSchedModelOrItineraries() &&		assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
"Missing machine model\n");		"Missing machine model\n");
// NewRoot is the last instruction in the \p InsInstrs vector.		// NewRoot is the last instruction in the \p InsInstrs vector.
unsigned NewRootIdx = InsInstrs.size() - 1;		unsigned NewRootIdx = InsInstrs.size() - 1;
MachineInstr *NewRoot = InsInstrs[NewRootIdx];		MachineInstr *NewRoot = InsInstrs[NewRootIdx];

Show All 11 Lines	bool MachineCombiner::improvesCriticalPathLen(
// Being conservative also protects against inaccuracies in the underlying		// Being conservative also protects against inaccuracies in the underlying
// machine trace metrics and CPU models.		// machine trace metrics and CPU models.
if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)		if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
return NewRootDepth < RootDepth;		return NewRootDepth < RootDepth;

// A more flexible cost calculation for the critical path includes the slack		// A more flexible cost calculation for the critical path includes the slack
// of the original code sequence. This may allow the transform to proceed		// of the original code sequence. This may allow the transform to proceed
// even if the instruction depths (data dependency cycles) become worse.		// even if the instruction depths (data dependency cycles) become worse.

unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);		unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
unsigned RootLatency = TSchedModel.computeInstrLatency(Root);		unsigned RootLatency = 0;

		for (auto I : DelInstrs)
		RootLatency += TSchedModel.computeInstrLatency(I);

unsigned RootSlack = BlockTrace.getInstrSlack(*Root);		unsigned RootSlack = BlockTrace.getInstrSlack(*Root);

DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";		DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
dbgs() << " RootLatency: " << RootLatency << "\n";		dbgs() << " RootLatency: " << RootLatency << "\n";
dbgs() << " RootSlack: " << RootSlack << "\n";		dbgs() << " RootSlack: " << RootSlack << "\n";
dbgs() << " NewRootDepth + NewRootLatency = "		dbgs() << " NewRootDepth + NewRootLatency = "
<< NewRootDepth + NewRootLatency << "\n";		<< NewRootDepth + NewRootLatency << "\n";
dbgs() << " RootDepth + RootLatency + RootSlack = "		dbgs() << " RootDepth + RootLatency + RootSlack = "
▲ Show 20 Lines • Show All 134 Lines • ▼ Show 20 Lines	for (auto P : Patterns) {
SubstituteAlways = true;		SubstituteAlways = true;

// Substitute when we optimize for codesize and the new sequence has		// Substitute when we optimize for codesize and the new sequence has
// fewer instructions OR		// fewer instructions OR
// the new sequence neither lengthens the critical path nor increases		// the new sequence neither lengthens the critical path nor increases
// resource pressure.		// resource pressure.
if (SubstituteAlways \|\| doSubstitute(NewInstCount, OldInstCount) \|\|		if (SubstituteAlways \|\| doSubstitute(NewInstCount, OldInstCount) \|\|
(improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,		(improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
InstrIdxForVirtReg, P) &&		DelInstrs, InstrIdxForVirtReg, P) &&
preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {		preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
for (auto *InstrPtr : InsInstrs)		for (auto *InstrPtr : InsInstrs)
MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);		MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
for (auto *InstrPtr : DelInstrs)		for (auto *InstrPtr : DelInstrs)
InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();		InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();

Changed = true;		Changed = true;
++NumInstCombined;		++NumInstCombined;
▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/arm64-fma-combines.ll

	; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math \| FileCheck %s			; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math \| FileCheck %s
	define void @foo_2d(double* %src) {			define void @foo_2d(double* %src) {
	; CHECK-LABEL: %entry			; CHECK-LABEL: %entry
	; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}			; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
	; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}			; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
	entry:			entry:
	%arrayidx1 = getelementptr inbounds double, double* %src, i64 5			%arrayidx1 = getelementptr inbounds double, double* %src, i64 5
	%arrayidx2 = getelementptr inbounds double, double* %src, i64 11			%arrayidx2 = getelementptr inbounds double, double* %src, i64 11
	%tmp = bitcast double* %arrayidx1 to <2 x double>*			%tmp = bitcast double* %arrayidx1 to <2 x double>*
	%tmp1 = load double, double* %arrayidx2, align 8			%tmp1 = load double, double* %arrayidx2, align 8
	%tmp2 = load double, double* %arrayidx1, align 8			%tmp2 = load double, double* %arrayidx1, align 8
	%fmul = fmul fast double %tmp1, %tmp1			%fmul = fmul fast double %tmp1, %tmp1
	%fmul2 = fmul fast double %tmp2, 0x3F94AFD6A052BF5B			%fmul2 = fmul fast double %tmp2, 0x3F94AFD6A052BF5B
	▲ Show 20 Lines • Show All 123 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/machine-combiner_madd_during_address_computation.ll

This file was added.

				; Converted from a machine-reduced C++ test case because hand-written test cases passed without first improving the compiler.

				; test all AArch64 subarches known as of Dec. 7 2016 to have scheduling models
				; ----------------------------------------------------------------------------
				; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -o - %s \| FileCheck %s
				; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=cortex-a72 -o - %s \| FileCheck %s
				; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=cortex-a73 -o - %s \| FileCheck %s
				; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=cyclone -o - %s \| FileCheck %s
				; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=exynos-m1 -o - %s \| FileCheck %s
				; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=exynos-m2 -o - %s \| FileCheck %s
				; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=kryo -o - %s \| FileCheck %s
				; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=vulcan -o - %s \| FileCheck %s

				; CHECK-NOT: mul
				; CHECK: madd
				; CHECK-NOT: mul

				target triple = "aarch64-sarc-linux-gnu"

				%class.BtlConfig = type { %class.C }
				%class.C = type { %class.B }
				%class.B = type { %class.D* }
				%class.D = type { %class.basic_string.base, [4 x i8] }
				%class.basic_string.base = type <{ i64, i64, i32 }>
				%class.basic_string = type <{ i64, i64, i32, [4 x i8] }>
				@a = global %class.BtlConfig zeroinitializer, align 8
				@.str = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
				declare i64 @_ZN1CI1D1AIS0_EE5m_fn1Ev(%class.C*) local_unnamed_addr
				declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
				define internal void @_GLOBAL__sub_I_main_adv.ii() section ".text.startup" {
				entry:
				%tmp.i.i = alloca %class.D, align 8
				%agg.tmp.i.i = alloca %class.D, align 8
				%0 = bitcast %class.D* %tmp.i.i to i8*
				%1 = bitcast %class.D* %agg.tmp.i.i to i8*
				%call8.i.i = tail call i64 @_ZN1CI1D1AIS0_EE5m_fn1Ev(%class.C* getelementptr inbounds (%class.BtlConfig, %class.BtlConfig* @a, i64 0, i32 0))
				%cmp9.i.i = icmp sgt i64 %call8.i.i, 0
				br i1 %cmp9.i.i, label %for.body.lr.ph.i.i, label %__cxx_global_var_init.exit
				for.body.lr.ph.i.i:
				%2 = bitcast %class.D* %agg.tmp.i.i to %class.basic_string*
				br label %for.body.i.i
				for.body.i.i:
				%conv11.i.i = phi i64 [ 0, %for.body.lr.ph.i.i ], [ %conv.i.i, %for.body.i.i ]
				%i.010.i.i = phi i32 [ undef, %for.body.lr.ph.i.i ], [ %inc.i.i, %for.body.i.i ]
				%3 = load %class.D, %class.D* getelementptr inbounds (%class.BtlConfig, %class.BtlConfig* @a, i64 0, i32 0, i32 0, i32 0), align 8, !tbaa !1, !noalias !6
				%arrayidx.i.i.i = getelementptr inbounds %class.D, %class.D* %3, i64 %conv11.i.i
				%4 = bitcast %class.D* %arrayidx.i.i.i to i8*
				call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %0, i8* %4, i64 24, i32 8, i1 false)
				%inc.i.i = add i32 %i.010.i.i, 1
				%conv.i.i = zext i32 %inc.i.i to i64
				%call.i.i = call i64 @_ZN1CI1D1AIS0_EE5m_fn1Ev(%class.C* getelementptr inbounds (%class.BtlConfig, %class.BtlConfig* @a, i64 0, i32 0))
				%cmp.i.i = icmp slt i64 %conv.i.i, %call.i.i
				br i1 %cmp.i.i, label %for.body.i.i, label %__cxx_global_var_init.exit.loopexit
				__cxx_global_var_init.exit.loopexit:
				br label %__cxx_global_var_init.exit
				__cxx_global_var_init.exit:
				ret void
				}
				!1 = !{!2, !3, i64 0}
				!2 = !{!"foo", !3, i64 0}
				!3 = !{!"bar", !4, i64 0}
				!4 = !{!"baz", !5, i64 0}
				!5 = !{!"boo"}
				!6 = !{!7}
				!7 = distinct !{!7, !8, !"_ZN1CI1D1AIS0_EEixEl: %agg.result"}
				!8 = distinct !{!8, !"_ZN1CI1D1AIS0_EEixEl"}

llvm/test/CodeGen/AArch64/mul-lohi.ll

	; RUN: llc -mtriple=arm64-apple-ios7.0 -mcpu=cyclone %s -o - \| FileCheck %s			; RUN: llc -mtriple=arm64-apple-ios7.0 -mcpu=cyclone %s -o - \| FileCheck %s
	; RUN: llc -mtriple=aarch64_be-linux-gnu -mcpu=cyclone %s -o - \| FileCheck --check-prefix=CHECK-BE %s			; RUN: llc -mtriple=aarch64_be-linux-gnu -mcpu=cyclone %s -o - \| FileCheck --check-prefix=CHECK-BE %s

	define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {			define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
	; CHECK-LABEL: test_128bitmul:			; CHECK-LABEL: test_128bitmul:
	; CHECK-DAG: mul [[PART1:x[0-9]+]], x0, x3			; CHECK: umulh [[HI:x[0-9]+]], x0, x2
	; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2			; CHECK: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
	; CHECK: mul [[PART2:x[0-9]+]], x1, x2			; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]
	; CHECK: mul x0, x0, x2			; CHECK-DAG: mul x0, x0, x2
				; CHECK-NEXT: ret

	; CHECK-BE-LABEL: test_128bitmul:			; CHECK-BE-LABEL: test_128bitmul:
	; CHECK-BE-DAG: mul [[PART1:x[0-9]+]], x1, x2			; CHECK-BE: umulh [[HI:x[0-9]+]], x1, x3
	; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3			; CHECK-BE: madd [[TEMP1:x[0-9]+]], x1, x2, [[HI]]
	; CHECK-BE: mul [[PART2:x[0-9]+]], x0, x3			; CHECK-BE-DAG: madd x0, x0, x3, [[TEMP1]]
	; CHECK-BE: mul x1, x1, x3			; CHECK-BE-DAG: mul x1, x1, x3
				; CHECK-BE-NEXT: ret

	%prod = mul i128 %lhs, %rhs			%prod = mul i128 %lhs, %rhs
	ret i128 %prod			ret i128 %prod
	}			}

	; The machine combiner should create madd instructions when			; The machine combiner should create madd instructions when
	; optimizing for size because that's smaller than mul + add.			; optimizing for size because that's smaller than mul + add.

	define i128 @test_128bitmul_optsize(i128 %lhs, i128 %rhs) optsize {			define i128 @test_128bitmul_optsize(i128 %lhs, i128 %rhs) optsize {
	; CHECK-LABEL: test_128bitmul_optsize:			; CHECK-LABEL: test_128bitmul_optsize:
	; CHECK: umulh [[HI:x[0-9]+]], x0, x2			; CHECK: umulh [[HI:x[0-9]+]], x0, x2
	; CHECK-NEXT: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]			; CHECK-NEXT: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
	; CHECK-NEXT: madd x1, x1, x2, [[TEMP1]]			; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]
	; CHECK-NEXT: mul x0, x0, x2			; CHECK-DAG: mul x0, x0, x2
	; CHECK-NEXT: ret			; CHECK-NEXT: ret

	%prod = mul i128 %lhs, %rhs			%prod = mul i128 %lhs, %rhs
	ret i128 %prod			ret i128 %prod
	}			}

	define i128 @test_128bitmul_minsize(i128 %lhs, i128 %rhs) minsize {			define i128 @test_128bitmul_minsize(i128 %lhs, i128 %rhs) minsize {
	; CHECK-LABEL: test_128bitmul_minsize:			; CHECK-LABEL: test_128bitmul_minsize:
	; CHECK: umulh [[HI:x[0-9]+]], x0, x2			; CHECK: umulh [[HI:x[0-9]+]], x0, x2
	; CHECK-NEXT: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]			; CHECK-NEXT: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
	; CHECK-NEXT: madd x1, x1, x2, [[TEMP1]]			; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]
	; CHECK-NEXT: mul x0, x0, x2			; CHECK-DAG: mul x0, x0, x2
	; CHECK-NEXT: ret			; CHECK-NEXT: ret

	%prod = mul i128 %lhs, %rhs			%prod = mul i128 %lhs, %rhs
	ret i128 %prod			ret i128 %prod
	}			}