Skip to content

Commit db3f977

Browse files
committedJan 25, 2019
[HotColdSplit] Introduce a cost model to control splitting behavior
The main goal of the model is to avoid *increasing* function size, as that would eradicate any memory locality benefits from splitting. This happens when: - There are too many inputs or outputs to the cold region. Argument materialization and reloads of outputs have a cost. - The cold region has too many distinct exit blocks, causing a large switch to be formed in the caller. - The code size cost of the split code is less than the cost of a set-up call. A secondary goal is to prevent excessive overall binary size growth. With the cost model in place, I experimented to find a splitting threshold that works well in practice. To make warm & cold code easily separable for analysis purposes, I moved split functions to a "cold" section. I experimented with thresholds between [0, 4] and set the default to the threshold which minimized geomean __text size. Experiment data from building LNT+externals for X86 (N = 639 programs, all sizes in bytes): | Configuration | __text geom size | __cold geom size | TEXT geom size | | **-Os** | 1736.3 | 0, n=0 | 10961.6 | | -Os, thresh=0 | 1740.53 | 124.482, n=134 | 11014 | | -Os, thresh=1 | 1734.79 | 57.8781, n=90 | 10978.6 | | -Os, thresh=2 | ** 1733.85 ** | 65.6604, n=61 | 10977.6 | | -Os, thresh=3 | 1733.85 | 65.3071, n=61 | 10977.6 | | -Os, thresh=4 | 1735.08 | 67.5156, n=54 | 10965.7 | | **-Oz** | 1554.4 | 0, n=0 | 10153 | | -Oz, thresh=2 | ** 1552.2 ** | 65.633, n=61 | 10176 | | **-O3** | 2563.37 | 0, n=0 | 13105.4 | | -O3, thresh=2 | ** 2559.49 ** | 71.1072, n=61 | 13162.4 | Picking thresh=2 reduces the geomean __text section size by 0.14% at -Os, -Oz, and -O3 and causes ~0.2% growth in the TEXT segment. Note that TEXT size is page-aligned, whereas section sizes are byte-aligned. Experiment data from building LNT+externals for ARM64 (N = 558 programs, all sizes in bytes): | Configuration | __text geom size | __cold geom size | TEXT geom size | | **-Os** | 1763.96 | 0, n=0 | 42934.9 | | -Os, thresh=2 | ** 1760.9 ** | 76.6755, n=61 | 42934.9 | Picking thresh=2 reduces the geomean __text section size by 0.17% at -Os and causes no growth in the TEXT segment. Measurements were done with D57082 (r352080) applied. Differential Revision: https://reviews.llvm.org/D57125 llvm-svn: 352228
1 parent 13ef84f commit db3f977

11 files changed

+216
-129
lines changed
 

‎llvm/lib/Transforms/IPO/HotColdSplitting.cpp

+91-36
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,9 @@ static cl::opt<bool> EnableStaticAnalyis("hot-cold-static-analysis",
8080
cl::init(true), cl::Hidden);
8181

8282
static cl::opt<int>
83-
SplittingThreshold("hotcoldsplit-threshold", cl::init(3), cl::Hidden,
84-
cl::desc("Code size threshold for splitting cold code "
85-
"(as a multiple of TCC_Basic)"));
83+
SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
84+
cl::desc("Base penalty for splitting cold code (as a "
85+
"multiple of TCC_Basic)"));
8686

8787
namespace {
8888

@@ -139,31 +139,6 @@ static bool mayExtractBlock(const BasicBlock &BB) {
139139
!isa<InvokeInst>(BB.getTerminator());
140140
}
141141

142-
/// Check whether \p Region is profitable to outline.
143-
static bool isProfitableToOutline(const BlockSequence &Region,
144-
TargetTransformInfo &TTI) {
145-
// If the splitting threshold is set at or below zero, skip the usual
146-
// profitability check.
147-
if (SplittingThreshold <= 0)
148-
return true;
149-
150-
if (Region.size() > 1)
151-
return true;
152-
153-
int Cost = 0;
154-
const BasicBlock &BB = *Region[0];
155-
for (const Instruction &I : BB) {
156-
if (isa<DbgInfoIntrinsic>(&I) || &I == BB.getTerminator())
157-
continue;
158-
159-
Cost += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
160-
161-
if (Cost >= (SplittingThreshold * TargetTransformInfo::TCC_Basic))
162-
return true;
163-
}
164-
return false;
165-
}
166-
167142
/// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
168143
/// Return true if the function is changed.
169144
static bool markFunctionCold(Function &F) {
@@ -247,6 +222,82 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
247222
return true;
248223
}
249224

225+
/// Get the benefit score of outlining \p Region.
226+
static int getOutliningBenefit(ArrayRef<BasicBlock *> Region,
227+
TargetTransformInfo &TTI) {
228+
// Sum up the code size costs of non-terminator instructions. Tight coupling
229+
// with \ref getOutliningPenalty is needed to model the costs of terminators.
230+
int Benefit = 0;
231+
for (BasicBlock *BB : Region)
232+
for (Instruction &I : BB->instructionsWithoutDebug())
233+
if (&I != BB->getTerminator())
234+
Benefit +=
235+
TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
236+
237+
return Benefit;
238+
}
239+
240+
/// Get the penalty score for outlining \p Region.
241+
static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
242+
unsigned NumInputs, unsigned NumOutputs) {
243+
int Penalty = SplittingThreshold;
244+
LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
245+
246+
// If the splitting threshold is set at or below zero, skip the usual
247+
// profitability check.
248+
if (SplittingThreshold <= 0)
249+
return Penalty;
250+
251+
// The typical code size cost for materializing an argument for the outlined
252+
// call.
253+
LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumInputs << " inputs\n");
254+
const int CostForArgMaterialization = TargetTransformInfo::TCC_Basic;
255+
Penalty += CostForArgMaterialization * NumInputs;
256+
257+
// The typical code size cost for an output alloca, its associated store, and
258+
// its associated reload.
259+
LLVM_DEBUG(dbgs() << "Applying penalty for: " << NumOutputs << " outputs\n");
260+
const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
261+
Penalty += CostForRegionOutput * NumOutputs;
262+
263+
// Find the number of distinct exit blocks for the region. Use a conservative
264+
// check to determine whether control returns from the region.
265+
bool NoBlocksReturn = true;
266+
SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
267+
for (BasicBlock *BB : Region) {
268+
// If a block has no successors, only assume it does not return if it's
269+
// unreachable.
270+
if (succ_empty(BB)) {
271+
NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
272+
continue;
273+
}
274+
275+
for (BasicBlock *SuccBB : successors(BB)) {
276+
if (find(Region, SuccBB) == Region.end()) {
277+
NoBlocksReturn = false;
278+
SuccsOutsideRegion.insert(SuccBB);
279+
}
280+
}
281+
}
282+
283+
// Apply a `noreturn` bonus.
284+
if (NoBlocksReturn) {
285+
LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
286+
<< " non-returning terminators\n");
287+
Penalty -= Region.size();
288+
}
289+
290+
// Apply a penalty for having more than one successor outside of the region.
291+
// This penalty accounts for the switch needed in the caller.
292+
if (!SuccsOutsideRegion.empty()) {
293+
LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
294+
<< " non-region successors\n");
295+
Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
296+
}
297+
298+
return Penalty;
299+
}
300+
250301
Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region,
251302
DominatorTree &DT,
252303
BlockFrequencyInfo *BFI,
@@ -261,6 +312,18 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region,
261312
/* AllowAlloca */ false,
262313
/* Suffix */ "cold." + std::to_string(Count));
263314

315+
// Perform a simple cost/benefit analysis to decide whether or not to permit
316+
// splitting.
317+
SetVector<Value *> Inputs, Outputs, Sinks;
318+
CE.findInputsOutputs(Inputs, Outputs, Sinks);
319+
int OutliningBenefit = getOutliningBenefit(Region, TTI);
320+
int OutliningPenalty =
321+
getOutliningPenalty(Region, Inputs.size(), Outputs.size());
322+
LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
323+
<< ", penalty = " << OutliningPenalty << "\n");
324+
if (OutliningBenefit <= OutliningPenalty)
325+
return nullptr;
326+
264327
Function *OrigF = Region[0]->getParent();
265328
if (Function *OutF = CE.extractCodeRegion()) {
266329
User *U = *OutF->user_begin();
@@ -556,14 +619,6 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
556619
assert(!Region.empty() && "Empty outlining region in worklist");
557620
do {
558621
BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
559-
if (!isProfitableToOutline(SubRegion, TTI)) {
560-
LLVM_DEBUG({
561-
dbgs() << "Skipping outlining; not profitable to outline\n";
562-
SubRegion[0]->dump();
563-
});
564-
continue;
565-
}
566-
567622
LLVM_DEBUG({
568623
dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
569624
for (BasicBlock *BB : SubRegion)

‎llvm/test/Transforms/HotColdSplit/X86/extraction-subregion-breaks-phis.ll

-63
This file was deleted.

‎llvm/test/Transforms/HotColdSplit/X86/outline-expensive.ll

-25
This file was deleted.

‎llvm/test/Transforms/HotColdSplit/addr-taken.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=0 -S < %s | FileCheck %s
1+
; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=-1 -S < %s | FileCheck %s
22

33
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
44
target triple = "x86_64-apple-macosx10.14.0"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
; REQUIRES: asserts
2+
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
3+
4+
declare void @sink() cold
5+
6+
define void @foo(i32 %arg) {
7+
entry:
8+
br i1 undef, label %cold1, label %exit
9+
10+
cold1:
11+
; CHECK: Applying bonus for: 4 non-returning terminators
12+
call void @sink()
13+
br i1 undef, label %cold2, label %cold3
14+
15+
cold2:
16+
br label %cold4
17+
18+
cold3:
19+
br label %cold4
20+
21+
cold4:
22+
unreachable
23+
24+
exit:
25+
ret void
26+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
; REQUIRES: asserts
2+
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
3+
4+
declare void @sink(i32*, i32, i32) cold
5+
6+
@g = global i32 0
7+
8+
define void @foo(i32 %arg) {
9+
%local = load i32, i32* @g
10+
br i1 undef, label %cold, label %exit
11+
12+
cold:
13+
; CHECK: Applying penalty for: 2 inputs
14+
call void @sink(i32* @g, i32 %arg, i32 %local)
15+
ret void
16+
17+
exit:
18+
ret void
19+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
; REQUIRES: asserts
2+
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
3+
4+
declare void @sink() cold
5+
6+
@g = global i32 0
7+
8+
define i32 @foo(i32 %arg) {
9+
entry:
10+
br i1 undef, label %cold, label %exit
11+
12+
cold:
13+
; CHECK: Applying penalty for: 1 output
14+
; CHECK: Applying penalty for: 1 non-region successors
15+
%local = load i32, i32* @g
16+
call void @sink()
17+
br label %exit
18+
19+
exit:
20+
%p = phi i32 [ %local, %cold ], [ 0, %entry ]
21+
ret i32 %p
22+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
; REQUIRES: asserts
2+
; RUN: opt -hotcoldsplit -debug-only=hotcoldsplit -S < %s -o /dev/null 2>&1 | FileCheck %s
3+
4+
declare void @sink() cold
5+
6+
; CHECK-LABEL: Outlining in one_non_region_successor
7+
define void @one_non_region_successor(i32 %arg) {
8+
entry:
9+
br i1 undef, label %cold1, label %exit
10+
11+
cold1:
12+
; CHECK: Applying penalty for: 1 non-region successor
13+
call void @sink()
14+
br i1 undef, label %cold2, label %cold3
15+
16+
cold2:
17+
br i1 undef, label %cold4, label %exit
18+
19+
cold3:
20+
br i1 undef, label %cold4, label %exit
21+
22+
cold4:
23+
unreachable
24+
25+
exit:
26+
ret void
27+
}
28+
29+
; CHECK-LABEL: Outlining in two_non_region_successor
30+
define void @two_non_region_successors(i32 %arg) {
31+
entry:
32+
br i1 undef, label %cold1, label %exit1
33+
34+
cold1:
35+
; CHECK: Applying penalty for: 2 non-region successors
36+
call void @sink()
37+
br i1 undef, label %cold2, label %cold3
38+
39+
cold2:
40+
br i1 undef, label %cold4, label %exit1
41+
42+
cold3:
43+
br i1 undef, label %cold4, label %exit2
44+
45+
cold4:
46+
unreachable
47+
48+
exit1:
49+
br label %exit2
50+
51+
exit2:
52+
ret void
53+
}

‎llvm/test/Transforms/HotColdSplit/outline-disjoint-diamonds.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: opt -S -hotcoldsplit -hotcoldsplit-threshold=0 < %s 2>&1 | FileCheck %s
1+
; RUN: opt -S -hotcoldsplit -hotcoldsplit-threshold=-1 < %s 2>&1 | FileCheck %s
22

33
; CHECK-LABEL: define {{.*}}@fun
44
; CHECK: call {{.*}}@fun.cold.2(

‎llvm/test/Transforms/HotColdSplit/resume.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=0 -S < %s | FileCheck %s
1+
; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=-1 -S < %s | FileCheck %s
22

33
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
44
target triple = "x86_64-apple-macosx10.14.0"

‎llvm/test/Transforms/HotColdSplit/split-cold-2.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=0 -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s
2-
; RUN: opt -hotcoldsplit-threshold=0 -passes=hotcoldsplit -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s
1+
; RUN: opt -hotcoldsplit -hotcoldsplit-threshold=-1 -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s
2+
; RUN: opt -passes=hotcoldsplit -hotcoldsplit-threshold=-1 -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s
33

44
; Make sure this compiles. This test used to fail with an invalid phi node: the
55
; two predecessors were outlined and the SSA representation was invalid.

0 commit comments

Comments
 (0)
Please sign in to comment.