Index: llvm/lib/Transforms/IPO/HotColdSplitting.cpp =================================================================== --- llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -13,6 +13,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -135,11 +136,14 @@ return !BB.hasAddressTaken(); } -/// Check whether \p BB is profitable to outline (i.e. its code size cost meets -/// the threshold set in \p MinOutliningThreshold). -static bool isProfitableToOutline(const BasicBlock &BB, +/// Check whether \p Region is profitable to outline. +static bool isProfitableToOutline(const BlockSequence &Region, TargetTransformInfo &TTI) { + if (Region.size() > 1) + return true; + int Cost = 0; + const BasicBlock &BB = *Region[0]; for (const Instruction &I : BB) { if (isa(&I) || &I == BB.getTerminator()) continue; @@ -152,153 +156,6 @@ return false; } -/// Identify the maximal region of cold blocks which includes \p SinkBB. -/// -/// Include all blocks post-dominated by \p SinkBB, \p SinkBB itself, and all -/// blocks dominated by \p SinkBB. Exclude all other blocks, and blocks which -/// cannot be outlined. -/// -/// Return an empty sequence if the cold region is too small to outline, or if -/// the cold region has no warm predecessors. -static BlockSequence findMaximalColdRegion(BasicBlock &SinkBB, - TargetTransformInfo &TTI, - DominatorTree &DT, - PostDomTree &PDT) { - // The maximal cold region. - BlockSequence ColdRegion = {}; - - // The ancestor farthest-away from SinkBB, and also post-dominated by it. - BasicBlock *MaxAncestor = &SinkBB; - unsigned MaxAncestorHeight = 0; - - // Visit SinkBB's ancestors using inverse DFS. - auto PredIt = ++idf_begin(&SinkBB); - auto PredEnd = idf_end(&SinkBB); - while (PredIt != PredEnd) { - BasicBlock &PredBB = **PredIt; - bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB); - - // If SinkBB does not post-dominate a predecessor, do not mark the - // predecessor (or any of its predecessors) cold. - if (!SinkPostDom || !mayExtractBlock(PredBB)) { - PredIt.skipChildren(); - continue; - } - - // Keep track of the post-dominated ancestor farthest away from the sink. - unsigned AncestorHeight = PredIt.getPathLength(); - if (AncestorHeight > MaxAncestorHeight) { - MaxAncestor = &PredBB; - MaxAncestorHeight = AncestorHeight; - } - - ColdRegion.push_back(&PredBB); - ++PredIt; - } - - // CodeExtractor requires that all blocks to be extracted must be dominated - // by the first block to be extracted. - // - // To avoid spurious or repeated outlining, require that the max ancestor - // has a predecessor. By construction this predecessor is not in the cold - // region, i.e. its existence implies we don't outline the whole function. - // - // TODO: If MaxAncestor has no predecessors, we may be able to outline the - // second largest cold region that has a predecessor. - if (pred_empty(MaxAncestor) || - MaxAncestor->getSinglePredecessor() == MaxAncestor) - return {}; - - // Filter out predecessors not dominated by the max ancestor. - // - // TODO: Blocks not dominated by the max ancestor could be extracted as - // other cold regions. Marking outlined calls as noreturn when appropriate - // and outlining more than once per function could achieve most of the win. - auto EraseIt = remove_if(ColdRegion, [&](BasicBlock *PredBB) { - return PredBB != MaxAncestor && !DT.dominates(MaxAncestor, PredBB); - }); - ColdRegion.erase(EraseIt, ColdRegion.end()); - - // Add SinkBB to the cold region. - ColdRegion.push_back(&SinkBB); - - // Ensure that the first extracted block is the max ancestor. - if (ColdRegion[0] != MaxAncestor) { - auto AncestorIt = find(ColdRegion, MaxAncestor); - *AncestorIt = ColdRegion[0]; - ColdRegion[0] = MaxAncestor; - } - - // Find all successors of SinkBB dominated by SinkBB using DFS. - auto SuccIt = ++df_begin(&SinkBB); - auto SuccEnd = df_end(&SinkBB); - while (SuccIt != SuccEnd) { - BasicBlock &SuccBB = **SuccIt; - bool SinkDom = DT.dominates(&SinkBB, &SuccBB); - - // If SinkBB does not dominate a successor, do not mark the successor (or - // any of its successors) cold. - if (!SinkDom || !mayExtractBlock(SuccBB)) { - SuccIt.skipChildren(); - continue; - } - - ColdRegion.push_back(&SuccBB); - ++SuccIt; - } - - if (ColdRegion.size() == 1 && !isProfitableToOutline(*ColdRegion[0], TTI)) - return {}; - - return ColdRegion; -} - -/// Get the largest cold region in \p F. -static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI, - BlockFrequencyInfo *BFI, - TargetTransformInfo &TTI, - DominatorTree &DT, PostDomTree &PDT) { - // Keep track of the largest cold region. - BlockSequence LargestColdRegion = {}; - - for (BasicBlock &BB : F) { - // Identify cold blocks. - if (!mayExtractBlock(BB)) - continue; - bool Cold = - PSI.isColdBB(&BB, BFI) || (EnableStaticAnalyis && unlikelyExecuted(BB)); - if (!Cold) - continue; - - LLVM_DEBUG({ - dbgs() << "Found cold block:\n"; - BB.dump(); - }); - - // Find a maximal cold region we can outline. - BlockSequence ColdRegion = findMaximalColdRegion(BB, TTI, DT, PDT); - if (ColdRegion.empty()) { - LLVM_DEBUG(dbgs() << " Skipping (block not profitable to extract)\n"); - continue; - } - - ++NumColdRegionsFound; - - LLVM_DEBUG({ - llvm::dbgs() << "Identified cold region with " << ColdRegion.size() - << " blocks:\n"; - for (BasicBlock *BB : ColdRegion) - BB->dump(); - }); - - // TODO: Outline more than one region. - if (ColdRegion.size() > LargestColdRegion.size()) - LargestColdRegion = std::move(ColdRegion); - } - - return LargestColdRegion; -} - class HotColdSplitting { public: HotColdSplitting(ProfileSummaryInfo *ProfSI, @@ -310,6 +167,10 @@ private: bool shouldOutlineFrom(const Function &F) const; + bool outlineColdRegions(Function &F, ProfileSummaryInfo &PSI, + BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, + DominatorTree &DT, PostDomTree &PDT, + OptimizationRemarkEmitter &ORE); Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, OptimizationRemarkEmitter &ORE, unsigned Count); @@ -431,32 +292,260 @@ return nullptr; } +/// A pair of (basic block, score). +using BlockTy = std::pair; + +/// A maximal outlining region. This contains all blocks post-dominated by a +/// sink block, the sink block itself, and all blocks dominated by the sink. +class OutliningRegion { + /// A list of (block, score) pairs. A block's score is non-zero iff it's a + /// viable sub-region entry point. Blocks with higher scores are better entry + /// points (i.e. they are more distant ancestors of the sink block). + SmallVector Blocks = {}; + + /// The suggested entry point into the region. If the region has multiple + /// entry points, all blocks within the region may not be reachable from this + /// entry point. + BasicBlock *SuggestedEntryPoint = nullptr; + + /// Whether the entire function is cold. + bool EntireFunctionCold = false; + + /// Whether or not \p BB could be the entry point of an extracted region. + static bool isViableEntryPoint(BasicBlock &BB) { return !BB.isEHPad(); } + + /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise. + static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) { + return isViableEntryPoint(BB) ? Score : 0; + } + +public: + static OutliningRegion create(BasicBlock &SinkBB, const DominatorTree &DT, + const PostDomTree &PDT) { + OutliningRegion ColdRegion; + + // The ancestor farthest-away from SinkBB, and also post-dominated by it. + unsigned SinkScore = getEntryPointScore(SinkBB, 1); + ColdRegion.SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr; + unsigned BestScore = SinkScore; + + // Visit SinkBB's ancestors using inverse DFS. + auto PredIt = ++idf_begin(&SinkBB); + auto PredEnd = idf_end(&SinkBB); + while (PredIt != PredEnd) { + BasicBlock &PredBB = **PredIt; + bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB); + + // If SinkBB does not post-dominate a predecessor, do not mark the + // predecessor (or any of its predecessors) cold. + if (!SinkPostDom || !mayExtractBlock(PredBB)) { + PredIt.skipChildren(); + continue; + } + + // If the predecessor is cold and has no predecessors, the entire + // function must be cold. + if (pred_empty(&PredBB)) { + ColdRegion.EntireFunctionCold = true; + return ColdRegion; + } + + // Keep track of the post-dominated ancestor farthest away from the sink. + // Add a large constant to its score as it makes for a better entry point + // (compared to sink-successor blocks). + unsigned PredScore = + getEntryPointScore(PredBB, PredIt.getPathLength() + INT_MAX); + if (PredScore > BestScore) { + ColdRegion.SuggestedEntryPoint = &PredBB; + BestScore = PredScore; + } + + ColdRegion.Blocks.push_back({&PredBB, PredScore}); + ++PredIt; + } + + // Add SinkBB to the cold region. + ColdRegion.Blocks.push_back({&SinkBB, SinkScore}); + + // Find all successors of SinkBB dominated by SinkBB using DFS. + auto SuccIt = ++df_begin(&SinkBB); + auto SuccEnd = df_end(&SinkBB); + while (SuccIt != SuccEnd) { + BasicBlock &SuccBB = **SuccIt; + bool SinkDom = DT.dominates(&SinkBB, &SuccBB); + + // If SinkBB does not dominate a successor, do not mark the successor (or + // anu of its successors) cold. + if (!SinkDom || !mayExtractBlock(SuccBB)) { + SuccIt.skipChildren(); + continue; + } + + unsigned SuccScore = getEntryPointScore(SuccBB, 1); + if (SuccScore > BestScore) { + ColdRegion.SuggestedEntryPoint = &SuccBB; + BestScore = SuccScore; + } + + ColdRegion.Blocks.push_back({&SuccBB, SuccScore}); + ++SuccIt; + } + + return ColdRegion; + } + + /// Whether this region has nothing to extract. + bool empty() const { return !SuggestedEntryPoint; } + + /// The blocks in this region. + ArrayRef> blocks() const { return Blocks; } + + /// Whether the entire function containing this region is cold. + bool isEntireFunctionCold() const { return EntireFunctionCold; } + + /// Remove a sub-region from this region and return it as a block sequence. + BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) { + assert(!empty() && !isEntireFunctionCold() && "Nothing to extract"); + + // Remove blocks dominated by the suggested entry point from this region. + // During the removal, identify the next best entry point into the region. + BasicBlock *NextEntryPoint = nullptr; + unsigned NextScore = 0; + auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) { + BasicBlock *BB = Block.first; + unsigned Score = Block.second; + bool InSubRegion = + BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB); + if (!InSubRegion && Score > NextScore) { + NextEntryPoint = BB; + NextScore = Score; + } + return InSubRegion; + }); + + // Construct the sub-region to extract from the set of removed blocks, + // then erase it from the containing region. + BlockSequence SubRegion; + for (const BlockTy &Block : make_range(RegionStartIt, Blocks.end())) + SubRegion.push_back(Block.first); + Blocks.erase(RegionStartIt, Blocks.end()); + + // Ensure that the first extracted block is the suggested entry point. + if (SubRegion[0] != SuggestedEntryPoint) { + auto *AncestorIt = find(SubRegion, SuggestedEntryPoint); + *AncestorIt = SubRegion[0]; + SubRegion[0] = SuggestedEntryPoint; + } + + // Update the suggested entry point. + SuggestedEntryPoint = NextEntryPoint; + + return SubRegion; + } +}; + +bool HotColdSplitting::outlineColdRegions(Function &F, ProfileSummaryInfo &PSI, + BlockFrequencyInfo *BFI, + TargetTransformInfo &TTI, + DominatorTree &DT, PostDomTree &PDT, + OptimizationRemarkEmitter &ORE) { + bool Changed = false; + + // The set of cold blocks. + SmallPtrSet ColdBlocks; + + // The worklist of non-intersecting regions left to outline. + SmallVector OutliningWorklist; + + // Set up an RPO traversal. Experimentally, this performs better (outlines + // more) than a PO traversal, because we prevent region overlap by keeping + // the first region to contain a block. + ReversePostOrderTraversal RPOT(&F); + + // Find all cold regions. + for (BasicBlock *BB : RPOT) { + // Skip blocks which can't be outlined. + if (!mayExtractBlock(*BB)) + continue; + + // This block is already part of some outlining region. + if (ColdBlocks.count(BB)) + continue; + + bool Cold = + PSI.isColdBB(BB, BFI) || (EnableStaticAnalyis && unlikelyExecuted(*BB)); + if (!Cold) + continue; + + LLVM_DEBUG({ + dbgs() << "Found cold block:\n"; + BB->dump(); + }); + + auto Region = OutliningRegion::create(*BB, DT, PDT); + if (Region.empty()) + continue; + + if (Region.isEntireFunctionCold()) { + // TODO: Move this function into a cold section. + LLVM_DEBUG(dbgs() << "Entire function is cold\n"); + return false; + } + + // If this outlining regions intersects with another, drop the new region. + // + // TODO: It's theoretically possible to outline more by only keeping the + // largest region which contains a block, but the extra bookkeeping to do + // this is tricky/expensive. + bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) { + return !ColdBlocks.insert(Block.first).second; + }); + if (RegionsOverlap) + continue; + + OutliningWorklist.push_back(std::move(Region)); + ++NumColdRegionsFound; + } + + // Outline single-entry cold regions, splitting up larger regions as needed. + while (!OutliningWorklist.empty()) { + OutliningRegion Region = OutliningWorklist.pop_back_val(); + assert(!Region.empty() && "Empty outlining region in worklist"); + do { + BlockSequence SubRegion = Region.takeSingleEntrySubRegion(DT); + if (!isProfitableToOutline(SubRegion, TTI)) { + LLVM_DEBUG(dbgs() << "Skipping outlining; not profitable to outline\n"); + continue; + } + + Function *Outlined = + extractColdRegion(SubRegion, DT, BFI, TTI, ORE, /*Count=*/1); + if (Outlined) { + OutlinedFunctions.insert(Outlined); + Changed = true; + } + } while (!Region.empty()); + } + + return Changed; +} + bool HotColdSplitting::run(Module &M) { bool Changed = false; + OutlinedFunctions.clear(); for (auto &F : M) { if (!shouldOutlineFrom(F)) { - LLVM_DEBUG(llvm::dbgs() << "Not outlining in " << F.getName() << "\n"); + LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n"); continue; } - LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n"); DominatorTree DT(F); PostDomTree PDT(F); PDT.recalculate(F); BlockFrequencyInfo *BFI = GetBFI(F); TargetTransformInfo &TTI = GetTTI(F); - - BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, TTI, DT, PDT); - if (ColdRegion.empty()) - continue; - OptimizationRemarkEmitter &ORE = (*GetORE)(F); - Function *Outlined = - extractColdRegion(ColdRegion, DT, BFI, TTI, ORE, /*Count=*/1); - if (Outlined) { - OutlinedFunctions.insert(Outlined); - Changed = true; - } + Changed |= outlineColdRegions(F, *PSI, BFI, TTI, DT, PDT, ORE); } return Changed; }