This is an archive of the discontinued LLVM Phabricator instance.

[CodeGenPrepare] Disable div bypass when working set size is huge.
ClosedPublic

Authored by eraman on Nov 13 2017, 5:16 PM.

Download Raw Diff

Details

Reviewers

Commits

rG0d55b55bb616: [CodeGenPrepare] Disable div bypass when working set size is huge.
rL318179: [CodeGenPrepare] Disable div bypass when working set size is huge.

Summary

Bypass of slow divs based on operand values is currently disabled for
-Os. Do the same when profile summary is available and the working set
size of the application is huge. This is similar to how loop peeling is
guarded by hasHugeWorkingSetSize. In the div bypass case, the generated
extra code (and the extra branch) tends to outweigh the benefits of the
bypass. This results in noticeable performance improvement on an
internal application.

Diff Detail

Repository: rL LLVM

Event Timeline

eraman created this revision. Nov 13 2017, 5:16 PM

Harbormaster completed remote builds in B12141: Diff 122753. Nov 13 2017, 5:18 PM

It seems like OptSize and hasHugeWorkingSetSize overlap a lot (a lot of large C++ codebases use -Os to help fit into the icache). Would it make sense to automatically add OptSize markings based on profile data, or something like that, rather than adding hasHugeWorkingSetSize() checks all over the compiler?

hasHugeWorkingSetSize() can be used to selectively disable size-increasing transformations, while -Os can be more aggressive at the cost of performance given its mission, so I don't think we should piggyback on -Os for performance purposes.

lgtm

This revision is now accepted and ready to land. Nov 14 2017, 9:49 AM

Closed by commit rL318179: [CodeGenPrepare] Disable div bypass when working set size is huge. (authored by eraman). · Explain Why · Nov 14 2017, 11:32 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

CodeGen/

CodeGenPrepare.cpp

7 lines

test/

CodeGen/

X86/

bypass-slow-division-tune.ll

28 lines

Diff 122886

llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp

Show First 20 Lines • Show All 347 Lines • ▼ Show 20 Lines	if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
TLI = SubtargetInfo->getTargetLowering();		TLI = SubtargetInfo->getTargetLowering();
TRI = SubtargetInfo->getRegisterInfo();		TRI = SubtargetInfo->getRegisterInfo();
}		}
TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();		TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);		TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();		LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
OptSize = F.optForSize();		OptSize = F.optForSize();

if (ProfileGuidedSectionPrefix) {
ProfileSummaryInfo *PSI =		ProfileSummaryInfo *PSI =
getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();		getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
		if (ProfileGuidedSectionPrefix) {
if (PSI->isFunctionHotInCallGraph(&F))		if (PSI->isFunctionHotInCallGraph(&F))
F.setSectionPrefix(".hot");		F.setSectionPrefix(".hot");
else if (PSI->isFunctionColdInCallGraph(&F))		else if (PSI->isFunctionColdInCallGraph(&F))
F.setSectionPrefix(".unlikely");		F.setSectionPrefix(".unlikely");
}		}

/// This optimization identifies DIV instructions that can be		/// This optimization identifies DIV instructions that can be
/// profitably bypassed and carried out with a shorter, faster divide.		/// profitably bypassed and carried out with a shorter, faster divide.
if (!OptSize && TLI && TLI->isSlowDivBypassed()) {		if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI &&
		TLI->isSlowDivBypassed()) {
const DenseMap<unsigned int, unsigned int> &BypassWidths =		const DenseMap<unsigned int, unsigned int> &BypassWidths =
TLI->getBypassSlowDivWidths();		TLI->getBypassSlowDivWidths();
BasicBlock* BB = &*F.begin();		BasicBlock* BB = &*F.begin();
while (BB != nullptr) {		while (BB != nullptr) {
// bypassSlowDivision may create new BBs, but we don't want to reapply the		// bypassSlowDivision may create new BBs, but we don't want to reapply the
// optimization to those blocks.		// optimization to those blocks.
BasicBlock* Next = BB->getNextNode();		BasicBlock* Next = BB->getNextNode();
EverMadeChange \|= bypassSlowDivision(BB, BypassWidths);		EverMadeChange \|= bypassSlowDivision(BB, BypassWidths);
▲ Show 20 Lines • Show All 6,299 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/bypass-slow-division-tune.ll

	; Check that a division is bypassed when appropriate only.			; Check that a division is bypassed when appropriate only.
	; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=atom < %s \| FileCheck -check-prefixes=ATOM,CHECK %s			; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=atom < %s \| FileCheck -check-prefixes=ATOM,CHECK %s
	; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=silvermont < %s \| FileCheck -check-prefixes=REST,CHECK %s			; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=silvermont < %s \| FileCheck -check-prefixes=REST,CHECK %s
	; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s \| FileCheck -check-prefixes=REST,CHECK %s			; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s \| FileCheck -check-prefixes=REST,CHECK %s
				; RUN: llc -profile-summary-huge-working-set-size-threshold=1 -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s \| FileCheck -check-prefixes=HUGEWS %s

	; Verify that div32 is bypassed only for Atoms.			; Verify that div32 is bypassed only for Atoms.
	define i32 @div32(i32 %a, i32 %b) {			define i32 @div32(i32 %a, i32 %b) {
	entry:			entry:
	; ATOM-LABEL: div32:			; ATOM-LABEL: div32:
	; ATOM: orl %{{.*}}, [[REG:%[a-z]+]]			; ATOM: orl %{{.*}}, [[REG:%[a-z]+]]
	; ATOM: testl $-256, [[REG]]			; ATOM: testl $-256, [[REG]]
	; ATOM: divb			; ATOM: divb
	Show All 18 Lines
	}			}


	; Verify that no extra code is generated when optimizing for size.			; Verify that no extra code is generated when optimizing for size.

	define i64 @div64_optsize(i64 %a, i64 %b) optsize {			define i64 @div64_optsize(i64 %a, i64 %b) optsize {
	; CHECK-LABEL: div64_optsize:			; CHECK-LABEL: div64_optsize:
	; CHECK-NOT: divl			; CHECK-NOT: divl
				; CHECK: ret
				%div = sdiv i64 %a, %b
				ret i64 %div
				}

				define i64 @div64_hugews(i64 %a, i64 %b) {
				; HUGEWS-LABEL: div64_hugews:
				; HUGEWS-NOT: divl
				; HUGEWS: ret
	%div = sdiv i64 %a, %b			%div = sdiv i64 %a, %b
	ret i64 %div			ret i64 %div
	}			}

	define i32 @div32_optsize(i32 %a, i32 %b) optsize {			define i32 @div32_optsize(i32 %a, i32 %b) optsize {
	; CHECK-LABEL: div32_optsize:			; CHECK-LABEL: div32_optsize:
	; CHECK-NOT: divb			; CHECK-NOT: divb
				; CHECK: ret
	%div = sdiv i32 %a, %b			%div = sdiv i32 %a, %b
	ret i32 %div			ret i32 %div
	}			}

	define i32 @div32_minsize(i32 %a, i32 %b) minsize {			define i32 @div32_minsize(i32 %a, i32 %b) minsize {
	; CHECK-LABEL: div32_minsize:			; CHECK-LABEL: div32_minsize:
	; CHECK-NOT: divb			; CHECK-NOT: divb
				; CHECK: ret
	%div = sdiv i32 %a, %b			%div = sdiv i32 %a, %b
	ret i32 %div			ret i32 %div
	}			}

				!llvm.module.flags = !{!1}
				!1 = !{i32 1, !"ProfileSummary", !2}
				!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
				!3 = !{!"ProfileFormat", !"InstrProf"}
				!4 = !{!"TotalCount", i64 10000}
				!5 = !{!"MaxCount", i64 1000}
				!6 = !{!"MaxInternalCount", i64 1}
				!7 = !{!"MaxFunctionCount", i64 1000}
				!8 = !{!"NumCounts", i64 3}
				!9 = !{!"NumFunctions", i64 3}
				!10 = !{!"DetailedSummary", !11}
				!11 = !{!12, !13, !14}
				!12 = !{i32 10000, i64 1000, i32 1}
				!13 = !{i32 999000, i64 1000, i32 3}
				!14 = !{i32 999999, i64 5, i32 3}