diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -638,9 +638,22 @@ /// Total hotness score according to profiling data for this binary. uint64_t TotalScore{0}; - /// Binary-wide stats for macro-fusion. - uint64_t MissedMacroFusionPairs{0}; - uint64_t MissedMacroFusionExecCount{0}; + /// Binary-wide aggregated stats. + struct BinaryStats { + /// Stats for macro-fusion. + uint64_t MissedMacroFusionPairs{0}; + uint64_t MissedMacroFusionExecCount{0}; + + /// Stats for stale profile matching: + /// the total number of basic blocks in the profile + uint32_t NumStaleBlocks{0}; + /// the number matched basic blocks + uint32_t NumMatchedBlocks{0}; + /// the total count of samples in the profile + uint64_t StaleSampleCount{0}; + /// the count matched samples + uint64_t MatchedSampleCount{0}; + } Stats; // Address of the first allocated segment. uint64_t FirstAllocAddress{std::numeric_limits::max()}; diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -381,7 +381,7 @@ /// Profile match ratio. float ProfileMatchRatio{0.0f}; - /// Raw branch count for this function in the profile + /// Raw branch count for this function in the profile. uint64_t RawBranchCount{0}; /// Indicates the type of profile the function is using. diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -2217,8 +2217,8 @@ << Twine::utohexstr(getAddress() + Offset) << " in function " << *this << "; executed " << BB.getKnownExecutionCount() << " times.\n"); - ++BC.MissedMacroFusionPairs; - BC.MissedMacroFusionExecCount += BB.getKnownExecutionCount(); + ++BC.Stats.MissedMacroFusionPairs; + BC.Stats.MissedMacroFusionExecCount += BB.getKnownExecutionCount(); } } diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1454,6 +1454,14 @@ 100.0 * NumInferredFunctions / NumAllStaleFunctions, 100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount, TotalSampleCount); + outs() << format( + "BOLT-INFO: stale inference matched %.2f%% of basic blocks" + " (%zu out of %zu stale) responsible for %.2f%% samples" + " (%zu out of %zu stale)\n", + 100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks, + BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks, + 100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount, + BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount); } if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) { @@ -1562,10 +1570,11 @@ } // Print information on missed macro-fusion opportunities seen on input. - if (BC.MissedMacroFusionPairs) { - outs() << "BOLT-INFO: the input contains " << BC.MissedMacroFusionPairs - << " (dynamic count : " << BC.MissedMacroFusionExecCount - << ") opportunities for macro-fusion optimization"; + if (BC.Stats.MissedMacroFusionPairs) { + outs() << format("BOLT-INFO: the input contains %zu (dynamic count : %zu)" + " opportunities for macro-fusion optimization", + BC.Stats.MissedMacroFusionPairs, + BC.Stats.MissedMacroFusionExecCount); switch (opts::AlignMacroOpFusion) { case MFT_NONE: outs() << ". Use -align-macro-fusion to fix.\n"; diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -27,17 +27,18 @@ #include "bolt/Core/HashUtilities.h" #include "bolt/Profile/YAMLProfileReader.h" +#include "llvm/ADT/Bitfields.h" #include "llvm/ADT/Hashing.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/SampleProfileInference.h" #include +using namespace llvm; + #undef DEBUG_TYPE #define DEBUG_TYPE "bolt-prof" -using namespace llvm; - namespace opts { extern cl::OptionCategory BoltOptCategory; @@ -141,49 +142,29 @@ /// components are of smaller size (e.g., uint16_t or uint8_t). struct BlendedBlockHash { private: - static uint64_t combineHashes(uint16_t Hash1, uint16_t Hash2, uint16_t Hash3, - uint16_t Hash4) { - uint64_t Hash = 0; - - Hash |= uint64_t(Hash4); - Hash <<= 16; - - Hash |= uint64_t(Hash3); - Hash <<= 16; - - Hash |= uint64_t(Hash2); - Hash <<= 16; - - Hash |= uint64_t(Hash1); - - return Hash; - } - - static void parseHashes(uint64_t Hash, uint16_t &Hash1, uint16_t &Hash2, - uint16_t &Hash3, uint16_t &Hash4) { - Hash1 = Hash & 0xffff; - Hash >>= 16; - - Hash2 = Hash & 0xffff; - Hash >>= 16; - - Hash3 = Hash & 0xffff; - Hash >>= 16; - - Hash4 = Hash & 0xffff; - Hash >>= 16; - } + using ValueOffset = Bitfield::Element; + using ValueOpcode = Bitfield::Element; + using ValueInstr = Bitfield::Element; + using ValueNeighbor = Bitfield::Element; public: explicit BlendedBlockHash() {} - explicit BlendedBlockHash(uint64_t CombinedHash) { - parseHashes(CombinedHash, Offset, OpcodeHash, InstrHash, NeighborHash); + explicit BlendedBlockHash(uint64_t Hash) { + Offset = Bitfield::get(Hash); + OpcodeHash = Bitfield::get(Hash); + InstrHash = Bitfield::get(Hash); + NeighborHash = Bitfield::get(Hash); } /// Combine the blended hash into uint64_t. uint64_t combine() const { - return combineHashes(Offset, OpcodeHash, InstrHash, NeighborHash); + uint64_t Hash = 0; + Bitfield::set(Hash, Offset); + Bitfield::set(Hash, OpcodeHash); + Bitfield::set(Hash, InstrHash); + Bitfield::set(Hash, NeighborHash); + return Hash; } /// Compute a distance between two given blended hashes. The smaller the @@ -236,9 +217,8 @@ /// Find the most similar block for a given hash. const FlowBlock *matchBlock(BlendedBlockHash BlendedHash) const { auto BlockIt = OpHashToBlocks.find(BlendedHash.OpcodeHash); - if (BlockIt == OpHashToBlocks.end()) { + if (BlockIt == OpHashToBlocks.end()) return nullptr; - } FlowBlock *BestBlock = nullptr; uint64_t BestDist = std::numeric_limits::max(); for (auto It : BlockIt->second) { @@ -306,6 +286,7 @@ BB->setHash(BlendedHashes[I].combine()); } } + /// Create a wrapper flow function to use with the profile inference algorithm, /// and initialize its jumps and metadata. FlowFunction @@ -314,7 +295,7 @@ // Add a special "dummy" source so that there is always a unique entry point. // Because of the extra source, for all other blocks in FlowFunction it holds - // that Block.Index == BB->getLayoutIndex() + 1 + // that Block.Index == BB->getIndex() + 1 FlowBlock EntryBlock; EntryBlock.Index = 0; Func.Blocks.push_back(EntryBlock); @@ -325,7 +306,7 @@ FlowBlock &Block = Func.Blocks.back(); Block.Index = Func.Blocks.size() - 1; (void)BB; - assert(Block.Index == BB->getLayoutIndex() + 1 && + assert(Block.Index == BB->getIndex() + 1 && "incorrectly assigned basic block index"); } @@ -341,8 +322,8 @@ Func.Jumps.emplace_back(); FlowJump &Jump = Func.Jumps.back(); - Jump.Source = SrcBB->getLayoutIndex() + 1; - Jump.Target = DstBB->getLayoutIndex() + 1; + Jump.Source = SrcBB->getIndex() + 1; + Jump.Target = DstBB->getIndex() + 1; InDegree[Jump.Target]++; UniqueSuccs.insert(DstBB); } @@ -354,8 +335,8 @@ Func.Jumps.emplace_back(); FlowJump &Jump = Func.Jumps.back(); - Jump.Source = SrcBB->getLayoutIndex() + 1; - Jump.Target = DstBB->getLayoutIndex() + 1; + Jump.Source = SrcBB->getIndex() + 1; + Jump.Target = DstBB->getIndex() + 1; InDegree[Jump.Target]++; UniqueSuccs.insert(DstBB); } @@ -393,20 +374,23 @@ /// of the basic blocks in the binary, the count is "matched" to the block. /// Similarly, if both the source and the target of a count in the profile are /// matched to a jump in the binary, the count is recorded in CFG. -void matchWeightsByHashes(const BinaryFunction::BasicBlockOrderType &BlockOrder, +void matchWeightsByHashes(BinaryContext &BC, + const BinaryFunction::BasicBlockOrderType &BlockOrder, const yaml::bolt::BinaryFunctionProfile &YamlBF, FlowFunction &Func) { assert(Func.Blocks.size() == BlockOrder.size() + 1); std::vector Blocks; std::vector BlendedHashes; + std::unordered_set AllInstrHashes; for (uint64_t I = 0; I < BlockOrder.size(); I++) { const BinaryBasicBlock *BB = BlockOrder[I]; assert(BB->getHash() != 0 && "empty hash of BinaryBasicBlock"); Blocks.push_back(&Func.Blocks[I + 1]); BlendedBlockHash BlendedHash(BB->getHash()); BlendedHashes.push_back(BlendedHash); - LLVM_DEBUG(dbgs() << "BB with index " << I << " has hash = " + AllInstrHashes.insert(BlendedHash.InstrHash); + LLVM_DEBUG(dbgs() << "\tBB with index " << I << " has hash = " << Twine::utohexstr(BB->getHash()) << "\n"); } StaleMatcher Matcher; @@ -421,15 +405,23 @@ const FlowBlock *MatchedBlock = Matcher.matchBlock(BlendedHash); if (MatchedBlock != nullptr) { MatchedBlocks[YamlBB.Index] = MatchedBlock; - LLVM_DEBUG(dbgs() << "Matched yaml block with bid = " << YamlBB.Index + LLVM_DEBUG(dbgs() << "\tMatched yaml block with bid = " << YamlBB.Index << " and hash = " << Twine::utohexstr(YamlBB.Hash) << " to BB with index = " << MatchedBlock->Index - 1 << "\n"); } else { LLVM_DEBUG( - dbgs() << "Couldn't match yaml block with bid = " << YamlBB.Index + dbgs() << "\tCouldn't match yaml block with bid = " << YamlBB.Index << " and hash = " << Twine::utohexstr(YamlBB.Hash) << "\n"); } + + // Update matching stats + ++BC.Stats.NumStaleBlocks; + BC.Stats.StaleSampleCount += YamlBB.ExecCount; + if (AllInstrHashes.find(BlendedHash.InstrHash) != AllInstrHashes.end()) { + ++BC.Stats.NumMatchedBlocks; + BC.Stats.MatchedSampleCount += YamlBB.ExecCount; + } } // Match jumps from the profile to the jumps from CFG @@ -475,7 +467,7 @@ // Assign block counts based on in-/out- jumps for (FlowBlock &Block : Func.Blocks) { if (OutWeight[Block.Index] == 0 && InWeight[Block.Index] == 0) { - assert(Block.HasUnknownWeight && "unmatched block with positive count"); + assert(Block.HasUnknownWeight && "unmatched block with a positive count"); continue; } Block.HasUnknownWeight = false; @@ -691,31 +683,33 @@ bool YAMLProfileReader::inferStaleProfile( BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) { - // Make sure that block indices and hashes are up to date - BF.getLayout().updateLayoutIndices(); + LLVM_DEBUG(dbgs() << "BOLT-INFO: applying profile inference for " + << "\"" << BF.getPrintName() << "\"\n"); + + // Make sure that block hashes are up to date. BF.computeBlockHashes(); const BinaryFunction::BasicBlockOrderType BlockOrder( BF.getLayout().block_begin(), BF.getLayout().block_end()); - // Create a wrapper flow function to use with the profile inference algorithm + // Create a wrapper flow function to use with the profile inference algorithm. FlowFunction Func = createFlowFunction(BlockOrder); - // Match as many block/jump counts from the stale profile as possible - matchWeightsByHashes(BlockOrder, YamlBF, Func); + // Match as many block/jump counts from the stale profile as possible. + matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func); // Adjust the flow function by marking unreachable blocks Unlikely so that - // they don't get any counts assigned + // they don't get any counts assigned. preprocessUnreachableBlocks(Func); - // Check if profile inference can be applied for the instance + // Check if profile inference can be applied for the instance. if (!canApplyInference(Func)) return false; - // Apply the profile inference algorithm + // Apply the profile inference algorithm. applyInference(Func); - // Collect inferred counts and update function annotations + // Collect inferred counts and update function annotations. assignProfile(BF, BlockOrder, Func); // As of now, we always mark the binary function having "correct" profile. diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -249,9 +249,6 @@ << " edges in profile did not match function " << BF << '\n'; if (!ProfileMatched && opts::InferStaleProfile) { - if (opts::Verbosity >= 1) - outs() << "BOLT-INFO: applying profile inference for " - << "\"" << BF.getPrintName() << "\"\n"; if (inferStaleProfile(BF, YamlBF)) { ProfileMatched = true; BF.markProfiled(YamlBP.Header.Flags);