diff --git a/bolt/include/bolt/Passes/SplitFunctions.h b/bolt/include/bolt/Passes/SplitFunctions.h
--- a/bolt/include/bolt/Passes/SplitFunctions.h
+++ b/bolt/include/bolt/Passes/SplitFunctions.h
@@ -23,6 +23,10 @@
   /// Split each function into a hot and cold fragment using profiling
   /// information.
   Profile2 = 0,
+  /// Split each function into a hot, warm, and cold fragment using profiling
+  /// information. This is best used in combination with
+  /// `--reorder-blocks=ext-tsp`.
+  Profile3,
   /// Split each function into a hot and cold fragment at a randomly chosen
   /// split point (ignoring any available profiling information).
   Random2,
@@ -41,7 +45,8 @@
   virtual ~SplitStrategy() = default;
   virtual bool canSplit(const BinaryFunction &BF) = 0;
   virtual bool keepEmpty() = 0;
-  virtual void fragment(const BlockIt Start, const BlockIt End) = 0;
+  virtual void fragment(const BinaryFunction &BF, const BlockIt Start,
+                        const BlockIt End) = 0;
 };
 
 /// Split function code in multiple parts.
diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp
--- a/bolt/lib/Passes/SplitFunctions.cpp
+++ b/bolt/lib/Passes/SplitFunctions.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormatVariadic.h"
 #include <algorithm>
@@ -90,6 +91,11 @@
     cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                           "split each function into a hot and cold fragment "
                           "using profiling information")),
+    cl::values(clEnumValN(
+        SplitFunctionsStrategy::Profile3, "profile3",
+        "split each function into a hot, warm, and cold fragment using "
+        "profiling information. This is best used in combination with "
+        "--reorder-blocks=ext-tsp (experimental)")),
     cl::values(clEnumValN(
         SplitFunctionsStrategy::Random2, "random2",
         "split each function into a hot and cold fragment at a randomly chosen "
@@ -104,6 +110,12 @@
         "fragment contains exactly a single basic block")),
     cl::desc("strategy used to partition blocks into fragments"),
     cl::cat(BoltOptCategory));
+
+static cl::opt<double> SplitSizeCost(
+    "split-size-cost",
+    cl::desc("weight of a basic block's size when deciding whether to move it "
+             "into the warm fragment (with --split-strategy=profile3)"),
+    cl::init(0.01), cl::cat(BoltOptCategory));
 } // namespace opts
 
 namespace {
@@ -126,7 +138,8 @@
   bool keepEmpty() override { return false; }
 
-  void fragment(const BlockIt Start, const BlockIt End) override {
+  void fragment(const BinaryFunction &BF, const BlockIt Start,
+                const BlockIt End) override {
     for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
       if (BB->getExecutionCount() == 0)
        BB->setFragmentNum(FragmentNum::cold());
@@ -134,6 +147,224 @@
   }
 };
 
+/// This strategy tries to partition the function into three fragments: one
+/// fragment contains the hot code, one fragment contains the cold code, and
+/// one fragment contains the not-so-hot (or warm) code that is not cold
+/// either. The goal is not to improve the caching behavior of the function
+/// itself, because when block-level reordering is applied, the warm part
+/// already sits right between the hot and cold parts. Instead, the target is
+/// to eliminate the effect that the presence of warm code has on other
+/// functions in the hot section, while minimizing the potential cost of
+/// branching from hot code to far-away warm code.
+///
+/// This implementation minimizes the cost function λ * s + b, where λ is a
+/// constant parameter, s is the combined size of the blocks in the hot
+/// fragment, and b is the frequency of branches taken from the hot to the
+/// warm fragment. It relies on the blocks being ordered such that hotness is
+/// approximately decreasing (e.g. by ext-tsp) and determines the optimal
+/// split point by calculating the cost at every block. All blocks before
+/// that point are assigned to the hot fragment; all other blocks become
+/// either warm or cold.
+struct SplitProfile3 final : public SplitStrategy {
+  BinaryContext &BC;
+  double MedianExecutionCount;
+
+  explicit SplitProfile3(BinaryContext &BC)
+      : BC(BC), MedianExecutionCount(calculateMedianExecutionCount(BC)) {}
+
+  bool canSplit(const BinaryFunction &BF) override {
+    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
+  }
+
+  bool keepEmpty() override {
+    // If there are no blocks in the warm fragment, do not get rid of it.
+    return true;
+  }
+
+  void fragment(const BinaryFunction &BF, const BlockIt Start,
+                const BlockIt End) override {
+    BinaryContext::IndependentCodeEmitter Emitter;
+    if (!opts::NoThreads)
+      Emitter = BC.createIndependentMCCodeEmitter();
+
+    DenseMap<const BinaryBasicBlock *, size_t> Size;
+    for (const BinaryBasicBlock *const BB : make_range(Start, End))
+      Size[BB] = BB->estimateSize(Emitter.MCE.get());
+
+    // Initially, assume that the function is split right at the start, so all
+    // blocks are assigned to the warm (or cold) fragment.
+    BlockIt BestSplit = Start;
+
+    // Try to adjust the size cost by the function's execution frequency.
+    // Using the same factor for almost-cold functions as for super-hot
+    // functions would move the almost-cold functions entirely into the warm
+    // fragment - except for their entry block (a current limitation). If
+    // those warm functions are clustered with other warm functions, moving
+    // everything but the entry into warm might hurt performance (jumping back
+    // and forth between hot and warm at each call). Hence, for functions that
+    // are called less frequently, reduce the size cost to split them less
+    // aggressively.
+    double SizeCost =
+        (1.0 - 1.0 / (1.0 + BF.getExecutionCount() / MedianExecutionCount)) *
+        opts::SplitSizeCost;
+    double CurrentCost = 0.0;
+
+    // Estimate the fixed cost of all non-outlineable blocks. These blocks are
+    // always hot regardless of where the function is split. This only affects
+    // entry points and, unless --split-eh is used, exception landing pads.
+    // Should entry points become outlineable at some point in the future, add
+    // an incoming cost based on their execution count to model the cost of
+    // them being in the warm fragment (while presumably being called from hot
+    // blocks).
+    assert(!(*Start)->canOutline() &&
+           "First block is assumed to be not outlineable");
+    for (BlockIt It = Start; It != End; ++It) {
+      const BinaryBasicBlock *const BB = *It;
+
+      if (!BB->canOutline()) {
+        CurrentCost += Size[BB] * SizeCost;
+
+        for (const BinaryBasicBlock *const Predecessor : BB->predecessors()) {
+          // Add cost of all branches from predecessors that are outlineable.
+          if (Predecessor->getLayoutIndex() < BB->getLayoutIndex() &&
+              Predecessor->canOutline()) {
+            const BinaryBasicBlock::BinaryBranchInfo &Branch =
+                Predecessor->getBranchInfo(*BB);
+            CurrentCost += Branch.Count;
+          }
+        }
+
+        for (const auto Pair : zip(BB->successors(), BB->branch_info())) {
+          const BinaryBasicBlock *const Successor = std::get<0>(Pair);
+          const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
+
+          // Add cost of all branches to successors that are outlineable.
+          if (Successor->getLayoutIndex() > BB->getLayoutIndex() &&
+              Successor->canOutline())
+            CurrentCost += Branch.Count;
+        }
+      } else {
+        // Entry blocks of callees are (usually) hot. Add some cost for calls
+        // made from this (for now warm) block to hot code.
+        for (const MCInst &I : *BB) {
+          if (BC.MIB->isCall(I))
+            CurrentCost += BB->getExecutionCount() / 2.0;
+        }
+      }
+    }
+
+    double BestCost = CurrentCost;
+
+    // Now estimate the cost at each possible hot/warm split point.
+    for (BlockIt It = Start; It != End; ++It) {
+      const BinaryBasicBlock *const BB = *It;
+
+      // If the block is not outlineable, its cost was already included in the
+      // initial estimation above.
+      if (BB->canOutline()) {
+        // Splitting after BB adds cost proportional to its size.
+        CurrentCost += Size[BB] * SizeCost;
+
+        // If hot predecessors previously assumed this block would be warm
+        // (and thus cut the edge), subtract the cost that was added for those
+        // edges: they no longer cross the split.
+        for (const BinaryBasicBlock *const Predecessor : BB->predecessors()) {
+          if (Predecessor->getLayoutIndex() < BB->getLayoutIndex()) {
+            const BinaryBasicBlock::BinaryBranchInfo &Branch =
+                Predecessor->getBranchInfo(*BB);
+            CurrentCost -= Branch.Count;
+          }
+        }
+
+        // If the cut is made after this block, add the cost of cutting all
+        // outgoing edges to non-hot blocks.
+        for (const auto Pair : zip(BB->successors(), BB->branch_info())) {
+          const BinaryBasicBlock *const Successor = std::get<0>(Pair);
+          const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
+
+          if (Successor->getLayoutIndex() > BB->getLayoutIndex()) {
+            // If the successor is outlineable, it is currently considered
+            // warm (while BB is hot) and the cost of that edge is added. If
+            // the successor is not outlineable however, remove the previously
+            // added cost.
+            //
+            // Subtracting cost could split a chain of blocks where some of
+            // them are outlineable and some of them are not (e.g. `not
+            // outlineable -> outlineable -> not outlineable -> huge
+            // outlineable` could split after the second not-outlineable
+            // block). In this particular case, that is fine, because jumping
+            // back and forth between hot and warm is not desired, while it
+            // may still be beneficial to cut off a huge tail. In practice,
+            // this pattern should not appear, as only exception landing pads
+            // and function entry points cannot be outlined.
+            if (Successor->canOutline())
+              CurrentCost += Branch.Count;
+            else
+              CurrentCost -= Branch.Count;
+          }
+        }
+
+        // The block is now hot, so its calls become hot -> hot; remove the
+        // warm-to-hot call cost added in the initial estimation.
+        for (const MCInst &I : *BB) {
+          if (BC.MIB->isCall(I))
+            CurrentCost -= BB->getExecutionCount() / 2.0;
+        }
+      }
+
+      // Did the cost improve? If the cost goes down or stays the same, put
+      // the current block into the hot fragment.
+      if (CurrentCost <= BestCost) {
+        BestSplit = std::next(It);
+        BestCost = CurrentCost;
+      }
+    }
+
+    // Regardless of the split point, assign never-executed blocks to the cold
+    // fragment. If --split-all-cold is set, they will actually be moved to
+    // the cold fragment. If not, they will remain in their respective
+    // hot/warm fragment.
+    const FragmentNum Warm(1);
+    const FragmentNum Cold(2);
+    for (BinaryBasicBlock *const BB : make_range(Start, BestSplit)) {
+      if (BB->getExecutionCount() > 0)
+        BB->setFragmentNum(FragmentNum::main());
+      else
+        BB->setFragmentNum(Cold);
+    }
+
+    for (BinaryBasicBlock *const BB : reverse(make_range(BestSplit, End))) {
+      if (BB->getExecutionCount() > 0)
+        BB->setFragmentNum(Warm);
+      else
+        BB->setFragmentNum(Cold);
+    }
+  }
+
+private:
+  static double calculateMedianExecutionCount(const BinaryContext &BC) {
+    SmallVector<uint64_t> CallFrequencies;
+    llvm::transform(BC.getBinaryFunctions(),
+                    std::back_inserter(CallFrequencies),
+                    [](const std::pair<const uint64_t, BinaryFunction> &BF) {
+                      return BF.second.getExecutionCount();
+                    });
+    llvm::erase_value(CallFrequencies, BinaryFunction::COUNT_NO_PROFILE);
+    llvm::erase_value(CallFrequencies, 0);
+    llvm::sort(CallFrequencies);
+
+    // If there is no profile, setting the size cost to zero will keep blocks
+    // in the main fragment.
+    double MedianCallFrequency = 0.0;
+    if (!CallFrequencies.empty()) {
+      MedianCallFrequency = CallFrequencies[CallFrequencies.size() / 2];
+      if (CallFrequencies.size() % 2 == 0)
+        MedianCallFrequency =
+            (MedianCallFrequency +
+             CallFrequencies[CallFrequencies.size() / 2 - 1]) /
+            2.0;
+    }
+
+    return 1.0 / MedianCallFrequency;
+  }
+};
+
 struct SplitRandom2 final : public SplitStrategy {
   std::minstd_rand0 Gen;
 
@@ -143,7 +374,8 @@
   bool keepEmpty() override { return false; }
 
-  void fragment(const BlockIt Start, const BlockIt End) override {
+  void fragment(const BinaryFunction &BF, const BlockIt Start,
+                const BlockIt End) override {
     using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
     const DiffT NumBlocks = End - Start;
     assert(NumBlocks > 0 && "Cannot fragment empty function");
@@ -170,7 +402,8 @@
   bool keepEmpty() override { return false; }
 
-  void fragment(const BlockIt Start, const BlockIt End) override {
+  void fragment(const BinaryFunction &BF, const BlockIt Start,
+                const BlockIt End) override {
     using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
     const DiffT NumBlocks = End - Start;
     assert(NumBlocks > 0 && "Cannot fragment empty function");
@@ -221,7 +454,8 @@
     return true;
   }
 
-  void fragment(const BlockIt Start, const BlockIt End) override {
+  void fragment(const BinaryFunction &BF, const BlockIt Start,
+                const BlockIt End) override {
     unsigned Fragment = 0;
     for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
       BB->setFragmentNum(FragmentNum(Fragment++));
@@ -244,11 +478,15 @@
   const size_t MainSize = FragmentSizes.lookup(FragmentNum::main());
   const size_t F1Size = FragmentSizes.lookup(FragmentNum(1));
+  const size_t F2Size = FragmentSizes.lookup(FragmentNum(2));
   Stats.emplace_back("hot", MainSize);
   if (FragmentSizes.size() == 2) {
     Stats.emplace_back("cold", F1Size);
-  } else if (FragmentSizes.size() > 2) {
+  } else if (FragmentSizes.size() == 3) {
+    Stats.emplace_back("warm", F1Size);
+    Stats.emplace_back("cold", F2Size);
+  } else if (FragmentSizes.size() > 3) {
     const size_t NonHotSize = TotalSize - MainSize;
     Stats.emplace_back("non-hot", NonHotSize);
   }
@@ -285,6 +523,9 @@
   case SplitFunctionsStrategy::Profile2:
     Strategy = std::make_unique<SplitProfile2>();
     break;
+  case SplitFunctionsStrategy::Profile3:
+    Strategy = std::make_unique<SplitProfile3>(BC);
+    break;
   case SplitFunctionsStrategy::Random2:
     Strategy = std::make_unique<SplitRandom2>();
    // If we split functions randomly, we need to ensure that across runs with
@@ -385,7 +626,7 @@
   }
 
   BF.getLayout().updateLayoutIndices();
-  S.fragment(NewLayout.begin(), NewLayout.end());
+  S.fragment(BF, NewLayout.begin(), NewLayout.end());
 
   // Make sure all non-outlineable blocks are in the main-fragment.
   for (BinaryBasicBlock *const BB : NewLayout) {
diff --git a/bolt/test/X86/split-profile3-chain.s b/bolt/test/X86/split-profile3-chain.s
new file mode 100644
--- /dev/null
+++ b/bolt/test/X86/split-profile3-chain.s
@@ -0,0 +1,101 @@
+# Test that a chain of outlineable blocks is not split
+
+# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=profile3 \
+# RUN:   --split-size-cost=0.2 --print-split --print-only=chain \
+# RUN:   --data=%t.fdata --reorder-blocks=ext-tsp \
+# RUN:   2>&1 | FileCheck --check-prefix=NOSPLIT %s
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=profile3 \
+# RUN:   --split-size-cost=0.3 --print-split --print-only=chain \
+# RUN:   --data=%t.fdata --reorder-blocks=ext-tsp \
+# RUN:   2>&1 | FileCheck --check-prefix=SPLIT %s
+
+# NOSPLIT: Binary Function "chain"
+# NOSPLIT: IsSplit :
+# NOSPLIT-SAME: {{ 0$}}
+
+# SPLIT: Binary Function "chain"
+# SPLIT: IsSplit :
+# SPLIT-SAME: {{ 1$}}
+# SPLIT: {{^\.LBB00}}
+# SPLIT: {{^\.LFT0}}
+# SPLIT: {{^\.Ltmp1}}
+# SPLIT: ------- HOT-COLD SPLIT POINT -------
+
+        .text
+        .globl chain
+        .type chain, @function
+chain:
+        pushq %rbp
+        movq %rsp, %rbp
+        cmpl $2, %edi
+LLentry_LLchain_start:
+        jge LLchain_start
+# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLchain_start# 0 10
+# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLfast# 0 50
+LLfast:
+        movl $5, %eax
+LLfast_LLexit:
+        jmp LLexit
+# FDATA: 1 chain #LLfast_LLexit# 1 chain #LLexit# 0 50
+LLchain_start:
+        movl $10, %eax
+LLchain_start_LLchain1:
+        jmp LLchain1
+# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLchain1# 0 10
+LLchain1:
+        addl $1, %eax
+LLchain1_LLchain2:
+        jmp LLchain2
+# FDATA: 1 chain #LLchain1_LLchain2# 1 chain #LLchain2# 0 10
+LLchain2:
+        addl $1, %eax
+LLchain2_LLchain3:
+        jmp LLchain3
+# FDATA: 1 chain #LLchain2_LLchain3# 1 chain #LLchain3# 0 10
+LLchain3:
+        addl $1, %eax
+        addl $1, %eax
+        addl $1, %eax
+        addl $1, %eax
+        addl $1, %eax
+LLchain3_LLchain4:
+        jmp LLchain4
+# FDATA: 1 chain #LLchain3_LLchain4# 1 chain #LLchain4# 0 10
+LLchain4:
+        addl $1, %eax
+        addl $1, %eax
+        addl $1, %eax
+        addl $1, %eax
+        addl $1, %eax
+LLchain4_LLexit:
+        jmp LLexit
+# FDATA: 1 chain #LLchain4_LLexit# 1 chain #LLexit# 0 10
+LLexit:
+        popq %rbp
+        ret
+LLchain_end:
+        .size chain, LLchain_end-chain
+
+
+        .globl main
+        .type main, @function
+main:
+        pushq %rbp
+        movq %rsp, %rbp
+        movl $1, %edi
+LLmain_chain1:
+        call chain
+# FDATA: 1 main #LLmain_chain1# 1 chain 0 0 50
+        movl $4, %edi
+LLmain_chain2:
+        call chain
+# FDATA: 1 main #LLmain_chain2# 1 chain 0 0 10
+        xorl %eax, %eax
+        popq %rbp
+        retq
+.Lmain_end:
+        .size main, .Lmain_end-main
diff --git a/bolt/test/X86/split-profile3-empty-warm.s b/bolt/test/X86/split-profile3-empty-warm.s
new file mode 100644
--- /dev/null
+++ b/bolt/test/X86/split-profile3-empty-warm.s
@@ -0,0 +1,57 @@
+# Test that 3-way splitting does not remove the warm fragment when it is empty
+
+# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=profile3 \
+# RUN:   --split-size-cost=0.1 --print-split --print-only=foo \
+# RUN:   --data=%t.fdata --reorder-blocks=ext-tsp \
+# RUN:   2>&1 | FileCheck %s
+
+# CHECK: Binary Function "foo"
+# CHECK: IsSplit :
+# CHECK-SAME: {{ 1$}}
+# CHECK: ------- HOT-COLD SPLIT POINT -------
+# CHECK: ------- HOT-COLD SPLIT POINT -------
+
+        .text
+        .globl foo
+        .type foo, @function
+foo:
+        pushq %rbp
+        movq %rsp, %rbp
+        cmpl $2, %edi
+LLentry_LLcold:
+        jge LLcold
+# FDATA: 1 foo #LLentry_LLcold# 1 foo #LLhot# 0 50
+LLhot:
+        movl $5, %eax
+LLhot_LLexit:
+        jmp LLexit
+# FDATA: 1 foo #LLhot_LLexit# 1 foo #LLexit# 0 50
+LLcold:
+        movl $10, %eax
+        jmp LLexit
+LLexit:
+        popq %rbp
+        ret
+LLfoo_end:
+        .size foo, LLfoo_end-foo
+
+
+        .globl main
+        .type main, @function
+main:
+        pushq %rbp
+        movq %rsp, %rbp
+        movl $1, %edi
+LLmain_foo:
+        call foo
+# FDATA: 1 main #LLmain_foo# 1 foo 0 0 50
+        movl $4, %edi
+        xorl %eax, %eax
+        popq %rbp
+        retq
+.Lmain_end:
+        .size main, .Lmain_end-main
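
As a quick, self-contained illustration of the cost model documented in the SplitProfile3 comment, the following standalone program runs an incremental prefix search in the spirit of the one implemented above. It is not BOLT code: `Block`, `findBestSplit`, and all of the numbers are made up for illustration. Block 0 plays the role of the non-outlineable entry, every other block is outlineable, only forward edges contribute to the cost (as in the pass), and the median-based scaling of the size cost, the warm-call penalty, and the cold-block assignment are all omitted.

// Minimize SizeCost * (size of the hot prefix) + (taken count of edges that
// cross from the hot prefix into the remaining, warm blocks) over every
// candidate split point.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

struct Block {
  uint64_t Size = 0;
  // Forward edges only: (successor layout index, taken count).
  std::vector<std::pair<size_t, uint64_t>> Succs;
};

// Returns the number of leading blocks that should stay hot. Block 0 is
// treated as non-outlineable (always hot), like a function entry point.
static size_t findBestSplit(const std::vector<Block> &Blocks, double SizeCost) {
  // Fixed cost of the pinned entry block: its size cost plus all of its
  // forward edges, which initially lead into the warm fragment.
  double Current = SizeCost * static_cast<double>(Blocks[0].Size);
  for (const auto &Edge : Blocks[0].Succs)
    Current += static_cast<double>(Edge.second);
  double Best = Current;
  size_t BestSplit = 1;

  for (size_t I = 1; I < Blocks.size(); ++I) {
    // Moving block I into the hot prefix adds its size cost and the cost of
    // its forward out-edges (they now cross from hot into warm)...
    Current += SizeCost * static_cast<double>(Blocks[I].Size);
    for (const auto &Edge : Blocks[I].Succs)
      Current += static_cast<double>(Edge.second);
    // ...and removes the cost of forward edges from already-hot blocks into
    // block I, because those edges no longer cross the split.
    for (size_t P = 0; P < I; ++P)
      for (const auto &Edge : Blocks[P].Succs)
        if (Edge.first == I)
          Current -= static_cast<double>(Edge.second);

    // Like the pass, prefer the larger hot prefix on ties.
    if (Current <= Best) {
      Best = Current;
      BestSplit = I + 1;
    }
  }
  return BestSplit;
}

int main() {
  // A layout loosely modeled on the split-profile3-chain.s test after
  // reordering: entry, hot body, and exit first, then a lukewarm chain at the
  // end whose backward edge to the exit is ignored, as in the pass.
  std::vector<Block> Blocks = {
      {8, {{1, 50}, {3, 10}}}, // entry: usually falls through to the hot body
      {4, {{2, 50}}},          // hot body
      {4, {}},                 // exit
      {30, {{4, 10}}},         // lukewarm chain, first half
      {30, {}},                // lukewarm chain, second half
  };

  for (double SizeCost : {0.05, 0.2})
    std::cout << "size cost " << SizeCost << " -> hot prefix of "
              << findBestSplit(Blocks, SizeCost) << " block(s)\n";
  return 0;
}

With these invented numbers, a size cost of 0.05 keeps all five blocks hot, while 0.2 leaves only the first three blocks hot and lets the chain become warm, which loosely mirrors the threshold behavior exercised by split-profile3-chain.s.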