diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -16,6 +16,7 @@ #include "bolt/Core/BinaryData.h" #include "bolt/Core/BinarySection.h" #include "bolt/Core/DebugData.h" +#include "bolt/Core/FunctionLayout.h" #include "bolt/Core/JumpTable.h" #include "bolt/Core/MCPlusBuilder.h" #include "bolt/RuntimeLibs/RuntimeLibrary.h" @@ -1169,11 +1170,11 @@ /// its code, and relaxing branch instructions. By default, branch /// instructions are updated to match the layout. Pass \p FixBranches set to /// false if the branches are known to be up to date with the code layout. - /// - /// Return the pair where the first size is for the main part, and the second - /// size is for the cold one. - std::pair calculateEmittedSize(BinaryFunction &BF, - bool FixBranches = true); + /// + /// Return a map from fragment number to the emitted size of that fragment. + /// It has one entry for the main fragment and one per split fragment. + DenseMap calculateEmittedSize(BinaryFunction &BF, + bool FixBranches = true); /// Calculate the size of the instruction \p Inst optionally using a /// user-supplied emitter for lock-free multi-thread work. 
MCCodeEmitter is diff --git a/bolt/include/bolt/Core/FunctionLayout.h b/bolt/include/bolt/Core/FunctionLayout.h --- a/bolt/include/bolt/Core/FunctionLayout.h +++ b/bolt/include/bolt/Core/FunctionLayout.h @@ -64,7 +64,30 @@ static constexpr FragmentNum main() { return FragmentNum(0); } static constexpr FragmentNum cold() { return FragmentNum(1); } }; +} // namespace bolt + +template <> struct DenseMapInfo { + using InnerType = DenseMapInfo; + + static inline bolt::FragmentNum getEmptyKey() { + return bolt::FragmentNum(InnerType::getEmptyKey()); + } + + static inline bolt::FragmentNum getTombstoneKey() { + return bolt::FragmentNum(InnerType::getTombstoneKey()); + } + static unsigned getHashValue(const bolt::FragmentNum &Val) { + return InnerType::getHashValue(Val.get()); + } + + static bool isEqual(const bolt::FragmentNum &LHS, + const bolt::FragmentNum &RHS) { + return InnerType::isEqual(LHS.get(), RHS.get()); + } +}; + +namespace bolt { /// A freestanding subset of contiguous blocks of a function. 
class FunctionFragment { using BasicBlockListType = SmallVector; diff --git a/bolt/include/bolt/Passes/SplitFunctions.h b/bolt/include/bolt/Passes/SplitFunctions.h --- a/bolt/include/bolt/Passes/SplitFunctions.h +++ b/bolt/include/bolt/Passes/SplitFunctions.h @@ -9,11 +9,11 @@ #ifndef BOLT_PASSES_SPLIT_FUNCTIONS_H #define BOLT_PASSES_SPLIT_FUNCTIONS_H +#include "bolt/Core/BinaryFunction.h" #include "bolt/Core/FunctionLayout.h" #include "bolt/Passes/BinaryPasses.h" #include "llvm/ADT/Hashing.h" #include "llvm/Support/CommandLine.h" -#include namespace llvm { namespace bolt { @@ -90,9 +90,6 @@ mergeEHTrampolines(BinaryFunction &BF, BasicBlockOrderType &Layout, const TrampolineSetType &Trampolines) const; - std::atomic SplitBytesHot{0ull}; - std::atomic SplitBytesCold{0ull}; - public: explicit SplitFunctions(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) {} diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -13,6 +13,7 @@ #include "bolt/Core/BinaryContext.h" #include "bolt/Core/BinaryEmitter.h" #include "bolt/Core/BinaryFunction.h" +#include "bolt/Core/FunctionLayout.h" #include "bolt/Utils/CommandLineOpts.h" #include "bolt/Utils/NameResolver.h" #include "bolt/Utils/Utils.h" @@ -2165,7 +2166,7 @@ return BF; } -std::pair +DenseMap BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) { // Adjust branch instruction to match the current layout. 
if (FixBranches) @@ -2234,14 +2235,16 @@ MCAsmLayout Layout(Assembler); Assembler.layout(Layout); - const uint64_t HotSize = + DenseMap Result; + Result[FragmentNum::main()] = Layout.getSymbolOffset(*EndLabel) - Layout.getSymbolOffset(*StartLabel); - const uint64_t ColdSize = - std::accumulate(SplitLabels.begin(), SplitLabels.end(), 0ULL, - [&](const uint64_t Accu, const LabelRange &Labels) { - return Accu + Layout.getSymbolOffset(*Labels.second) - - Layout.getSymbolOffset(*Labels.first); - }); + for (const auto &FF : enumerate(BF.getLayout().getSplitFragments())) { + const MCSymbol *const SplitStartLabel = SplitLabels[FF.index()].first; + const MCSymbol *const SplitEndLabel = SplitLabels[FF.index()].second; + Result[FF.value().getFragmentNum()] = + Layout.getSymbolOffset(*SplitEndLabel) - + Layout.getSymbolOffset(*SplitStartLabel); + } // Clean-up the effect of the code emission. for (const MCSymbol &Symbol : Assembler.symbols()) { @@ -2250,7 +2253,7 @@ MutableSymbol->setIsRegistered(false); } - return std::make_pair(HotSize, ColdSize); + return Result; } bool BinaryContext::validateEncoding(const MCInst &Inst, diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -565,10 +565,9 @@ // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit // incorrect debug info. 
ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { - uint64_t HotSize, ColdSize; - std::tie(HotSize, ColdSize) = + DenseMap FragmentSizes = BC.calculateEmittedSize(BF, /*FixBranches=*/false); - if (HotSize > BF.getMaxSize()) + if (FragmentSizes.lookup(FragmentNum::main()) > BF.getMaxSize()) BF.setSimple(false); }; diff --git a/bolt/lib/Passes/PatchEntries.cpp b/bolt/lib/Passes/PatchEntries.cpp --- a/bolt/lib/Passes/PatchEntries.cpp +++ b/bolt/lib/Passes/PatchEntries.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "bolt/Passes/PatchEntries.h" +#include "bolt/Core/FunctionLayout.h" #include "bolt/Utils/NameResolver.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/CommandLine.h" @@ -119,10 +120,13 @@ PatchFunction->addBasicBlock()->addInstructions(Seq); // Verify the size requirements. - uint64_t HotSize, ColdSize; - std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(*PatchFunction); - assert(!ColdSize && "unexpected cold code"); - assert(HotSize <= PatchSize && "max patch size exceeded"); + DenseMap FragmentSizes = + BC.calculateEmittedSize(*PatchFunction); + assert(FragmentSizes.count(FragmentNum::main()) && + "missing main fragment"); + assert(FragmentSizes.size() == 1 && "unexpected cold code"); + assert(FragmentSizes.lookup(FragmentNum::main()) <= PatchSize && + "max patch size exceeded"); } Function.setIsPatched(true); diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp --- a/bolt/lib/Passes/SplitFunctions.cpp +++ b/bolt/lib/Passes/SplitFunctions.cpp @@ -16,7 +16,6 @@ #include "bolt/Core/FunctionLayout.h" #include "bolt/Core/ParallelUtilities.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/CommandLine.h" @@ -228,6 +227,40 @@ BB->setFragmentNum(FragmentNum(Fragment++)); } }; + +std::string +formatFragmentStats(const DenseMap 
&FragmentSizes) { + if (FragmentSizes.empty()) + return ""; + + const size_t TotalSize = std::accumulate( + FragmentSizes.begin(), FragmentSizes.end(), (size_t)0, + [](const size_t Accu, const std::pair &FSize) { + return Accu + FSize.second; + }); + const double TotalSizeFP = static_cast(TotalSize); + + SmallVector, 3> Stats; + + const size_t MainSize = FragmentSizes.lookup(FragmentNum::main()); + const size_t F1Size = FragmentSizes.lookup(FragmentNum::cold()); + + Stats.emplace_back("hot", MainSize); + if (FragmentSizes.size() == 2) { + Stats.emplace_back("cold", F1Size); + } else if (FragmentSizes.size() > 2) { + const size_t NonHotSize = TotalSize - MainSize; + Stats.emplace_back("non-hot", NonHotSize); + } + + SmallVector, 3> FormattedStats; + for (const std::pair &Entry : Stats) + FormattedStats.emplace_back(formatv("{0}B {1} ({2:p2})", Entry.second, + Entry.first, + Entry.second / TotalSizeFP)); + + return llvm::join(FormattedStats, ", "); +} } // namespace namespace llvm { @@ -277,11 +310,20 @@ [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc, "SplitFunctions", ForceSequential); - if (SplitBytesHot + SplitBytesCold > 0) - outs() << "BOLT-INFO: splitting separates " << SplitBytesHot - << " hot bytes from " << SplitBytesCold << " cold bytes " - << format("(%.2lf%% of split functions is hot).\n", - 100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold)); + DenseMap TotalFragmentSizes; + for (BinaryFunction *const BF : BC.getAllBinaryFunctions()) { + const DenseMap FunctionFragmentSizes = + BC.calculateEmittedSize(*BF, /*FixBranches=*/false); + for (const std::pair &Pair : FunctionFragmentSizes) { + const FragmentNum Frag = Pair.first; + const size_t Size = Pair.second; + TotalFragmentSizes[Frag] += Size; + } + } + + outs() << formatv("BOLT-INFO: splitting generated {0} fragments with {1}\n", + TotalFragmentSizes.size(), + formatFragmentStats(TotalFragmentSizes)); } void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) { @@ 
-296,15 +338,15 @@ Layout.block_end()); BinaryContext &BC = BF.getBinaryContext(); - size_t OriginalHotSize; - size_t HotSize; - size_t ColdSize; + size_t OriginalHotSize = 0; if (BC.isX86()) { - std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF); - LLVM_DEBUG(dbgs() << "Estimated size for function " << BF - << " pre-split is <0x" - << Twine::utohexstr(OriginalHotSize) << ", 0x" - << Twine::utohexstr(ColdSize) << ">\n"); + const DenseMap OriginalFragmentSizes = + BC.calculateEmittedSize(BF); + OriginalHotSize = OriginalFragmentSizes.lookup(FragmentNum::main()); + LLVM_DEBUG( + dbgs() << formatv( + "BOLT-DEBUG: estimated size for function {0} pre-split is {1}\n", + BF, formatFragmentStats(OriginalFragmentSizes))); } BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(), @@ -404,16 +446,19 @@ // Check the new size to see if it's worth splitting the function. if (BC.isX86() && BF.isSplit()) { - std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF); - LLVM_DEBUG(dbgs() << "Estimated size for function " << BF - << " post-split is <0x" << Twine::utohexstr(HotSize) - << ", 0x" << Twine::utohexstr(ColdSize) << ">\n"); + const DenseMap FragmentSizes = + BC.calculateEmittedSize(BF); + const size_t NewHotSize = FragmentSizes.lookup(FragmentNum::main()); + LLVM_DEBUG( + dbgs() << formatv( + "BOLT-DEBUG: estimated size for function {0} post-split is {1}\n", + BF, formatFragmentStats(FragmentSizes))); if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <= - alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) { - LLVM_DEBUG(dbgs() << "Reversing splitting of function " << BF << ":\n 0x" - << Twine::utohexstr(HotSize) << ", 0x" - << Twine::utohexstr(ColdSize) << " -> 0x" - << Twine::utohexstr(OriginalHotSize) << '\n'); + alignTo(NewHotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) { + LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: reversing splitting of " + "function {0}:\n {1} -> {2:x+}\n", + BF, 
formatFragmentStats(FragmentSizes), + OriginalHotSize)); // Reverse the action of createEHTrampolines(). The trampolines will be // placed immediately before the matching destination resulting in no @@ -424,9 +469,6 @@ for (BinaryBasicBlock &BB : BF) BB.setFragmentNum(FragmentNum::main()); BF.getLayout().update(PreSplitLayout); - } else { - SplitBytesHot += HotSize; - SplitBytesCold += ColdSize; } } }