Skip to content

Commit 2960d41

Browse files
author
Jun Bum Lim
committedJun 2, 2017
[InlineCost] Enable the new switch cost heuristic
Summary: This is to enable the new switch inline cost heuristic (r301649) by removing the old heuristic as well as the flag itself. In my experiment for LLVM test suite and spec2000/2006, +17.82% performance and 8% code size reduce was observed in spec2000/vertex with O3 LTO in AArch64. No significant code size / performance regression was found in O3/O2/Os. No significant complain was reported from the llvm-dev thread. Reviewers: hans, chandlerc, eraman, haicheng, mcrosier, bmakam, eastig, ddibyend, echristo Reviewed By: echristo Subscribers: javed.absar, kristof.beyls, echristo, aemerson, rengolin, mehdi_amini Differential Revision: https://reviews.llvm.org/D32653 llvm-svn: 304594
1 parent 2c08fde commit 2960d41

File tree

2 files changed

+58
-78
lines changed

2 files changed

+58
-78
lines changed
 

‎llvm/lib/Analysis/InlineCost.cpp

+56-76
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,6 @@ static cl::opt<int>
5454
cl::init(45),
5555
cl::desc("Threshold for inlining cold callsites"));
5656

57-
static cl::opt<bool>
58-
EnableGenericSwitchCost("inline-generic-switch-cost", cl::Hidden,
59-
cl::init(false),
60-
cl::desc("Enable generic switch cost model"));
61-
6257
// We introduce this threshold to help performance of instrumentation based
6358
// PGO before we actually hook up inliner with analysis passes such as BPI and
6459
// BFI.
@@ -1015,83 +1010,68 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
10151010
if (isa<ConstantInt>(V))
10161011
return true;
10171012

1018-
if (EnableGenericSwitchCost) {
1019-
// Assume the most general case where the swith is lowered into
1020-
// either a jump table, bit test, or a balanced binary tree consisting of
1021-
// case clusters without merging adjacent clusters with the same
1022-
// destination. We do not consider the switches that are lowered with a mix
1023-
// of jump table/bit test/binary search tree. The cost of the switch is
1024-
// proportional to the size of the tree or the size of jump table range.
1025-
1026-
// Exit early for a large switch, assuming one case needs at least one
1027-
// instruction.
1028-
// FIXME: This is not true for a bit test, but ignore such case for now to
1029-
// save compile-time.
1030-
int64_t CostLowerBound =
1031-
std::min((int64_t)INT_MAX,
1032-
(int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
1033-
1034-
if (CostLowerBound > Threshold) {
1035-
Cost = CostLowerBound;
1036-
return false;
1037-
}
1013+
// Assume the most general case where the swith is lowered into
1014+
// either a jump table, bit test, or a balanced binary tree consisting of
1015+
// case clusters without merging adjacent clusters with the same
1016+
// destination. We do not consider the switches that are lowered with a mix
1017+
// of jump table/bit test/binary search tree. The cost of the switch is
1018+
// proportional to the size of the tree or the size of jump table range.
1019+
//
1020+
// NB: We convert large switches which are just used to initialize large phi
1021+
// nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
1022+
// inlining those. It will prevent inlining in cases where the optimization
1023+
// does not (yet) fire.
10381024

1039-
unsigned JumpTableSize = 0;
1040-
unsigned NumCaseCluster =
1041-
TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
1025+
// Exit early for a large switch, assuming one case needs at least one
1026+
// instruction.
1027+
// FIXME: This is not true for a bit test, but ignore such case for now to
1028+
// save compile-time.
1029+
int64_t CostLowerBound =
1030+
std::min((int64_t)INT_MAX,
1031+
(int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
10421032

1043-
// If suitable for a jump table, consider the cost for the table size and
1044-
// branch to destination.
1045-
if (JumpTableSize) {
1046-
int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
1047-
4 * InlineConstants::InstrCost;
1048-
Cost = std::min((int64_t)INT_MAX, JTCost + Cost);
1049-
return false;
1050-
}
1033+
if (CostLowerBound > Threshold) {
1034+
Cost = CostLowerBound;
1035+
return false;
1036+
}
10511037

1052-
// Considering forming a binary search, we should find the number of nodes
1053-
// which is same as the number of comparisons when lowered. For a given
1054-
// number of clusters, n, we can define a recursive function, f(n), to find
1055-
// the number of nodes in the tree. The recursion is :
1056-
// f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
1057-
// and f(n) = n, when n <= 3.
1058-
// This will lead a binary tree where the leaf should be either f(2) or f(3)
1059-
// when n > 3. So, the number of comparisons from leaves should be n, while
1060-
// the number of non-leaf should be :
1061-
// 2^(log2(n) - 1) - 1
1062-
// = 2^log2(n) * 2^-1 - 1
1063-
// = n / 2 - 1.
1064-
// Considering comparisons from leaf and non-leaf nodes, we can estimate the
1065-
// number of comparisons in a simple closed form :
1066-
// n + n / 2 - 1 = n * 3 / 2 - 1
1067-
if (NumCaseCluster <= 3) {
1068-
// Suppose a comparison includes one compare and one conditional branch.
1069-
Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
1070-
return false;
1071-
}
1072-
int64_t ExpectedNumberOfCompare = 3 * (uint64_t)NumCaseCluster / 2 - 1;
1073-
uint64_t SwitchCost =
1074-
ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
1075-
Cost = std::min((uint64_t)INT_MAX, SwitchCost + Cost);
1038+
unsigned JumpTableSize = 0;
1039+
unsigned NumCaseCluster =
1040+
TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
1041+
1042+
// If suitable for a jump table, consider the cost for the table size and
1043+
// branch to destination.
1044+
if (JumpTableSize) {
1045+
int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
1046+
4 * InlineConstants::InstrCost;
1047+
Cost = std::min((int64_t)INT_MAX, JTCost + Cost);
10761048
return false;
10771049
}
10781050

1079-
// Use a simple switch cost model where we accumulate a cost proportional to
1080-
// the number of distinct successor blocks. This fan-out in the CFG cannot
1081-
// be represented for free even if we can represent the core switch as a
1082-
// jumptable that takes a single instruction.
1083-
///
1084-
// NB: We convert large switches which are just used to initialize large phi
1085-
// nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
1086-
// inlining those. It will prevent inlining in cases where the optimization
1087-
// does not (yet) fire.
1088-
SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
1089-
SuccessorBlocks.insert(SI.getDefaultDest());
1090-
for (auto Case : SI.cases())
1091-
SuccessorBlocks.insert(Case.getCaseSuccessor());
1092-
// Add cost corresponding to the number of distinct destinations. The first
1093-
// we model as free because of fallthrough.
1094-
Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
1051+
// Considering forming a binary search, we should find the number of nodes
1052+
// which is same as the number of comparisons when lowered. For a given
1053+
// number of clusters, n, we can define a recursive function, f(n), to find
1054+
// the number of nodes in the tree. The recursion is :
1055+
// f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
1056+
// and f(n) = n, when n <= 3.
1057+
// This will lead a binary tree where the leaf should be either f(2) or f(3)
1058+
// when n > 3. So, the number of comparisons from leaves should be n, while
1059+
// the number of non-leaf should be :
1060+
// 2^(log2(n) - 1) - 1
1061+
// = 2^log2(n) * 2^-1 - 1
1062+
// = n / 2 - 1.
1063+
// Considering comparisons from leaf and non-leaf nodes, we can estimate the
1064+
// number of comparisons in a simple closed form :
1065+
// n + n / 2 - 1 = n * 3 / 2 - 1
1066+
if (NumCaseCluster <= 3) {
1067+
// Suppose a comparison includes one compare and one conditional branch.
1068+
Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
1069+
return false;
1070+
}
1071+
int64_t ExpectedNumberOfCompare = 3 * (uint64_t)NumCaseCluster / 2 - 1;
1072+
uint64_t SwitchCost =
1073+
ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
1074+
Cost = std::min((uint64_t)INT_MAX, SwitchCost + Cost);
10951075
return false;
10961076
}
10971077

‎llvm/test/Transforms/Inline/AArch64/switch.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
; RUN: opt < %s -inline -inline-threshold=20 -S -mtriple=aarch64-none-linux -inline-generic-switch-cost=true | FileCheck %s
2-
; RUN: opt < %s -passes='cgscc(inline)' -inline-threshold=20 -S -mtriple=aarch64-none-linux -inline-generic-switch-cost=true | FileCheck %s
1+
; RUN: opt < %s -inline -inline-threshold=20 -S -mtriple=aarch64-none-linux | FileCheck %s
2+
; RUN: opt < %s -passes='cgscc(inline)' -inline-threshold=20 -S -mtriple=aarch64-none-linux | FileCheck %s
33

44
define i32 @callee_range(i32 %a, i32* %P) {
55
switch i32 %a, label %sw.default [

0 commit comments

Comments
 (0)