Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -157,6 +157,7 @@ void initializeGlobalSplitPass(PassRegistry&); void initializeGlobalsAAWrapperPassPass(PassRegistry&); void initializeGuardWideningLegacyPassPass(PassRegistry&); +void initializeHotColdSplittingLegacyPassPass(PassRegistry&); void initializeIPCPPass(PassRegistry&); void initializeIPSCCPLegacyPassPass(PassRegistry&); void initializeIRTranslatorPass(PassRegistry&); Index: include/llvm/Transforms/IPO.h =================================================================== --- include/llvm/Transforms/IPO.h +++ include/llvm/Transforms/IPO.h @@ -210,6 +210,11 @@ createMergeSimilarFunctionsPass(const ModuleSummaryIndex *S = nullptr); //===----------------------------------------------------------------------===// +/// createHotColdSplittingPass - This pass outlines cold blocks into a separate +/// function(s). +ModulePass *createHotColdSplittingPass(); + +//===----------------------------------------------------------------------===// /// createPartialInliningPass - This pass inlines parts of functions. /// ModulePass *createPartialInliningPass(); Index: lib/Transforms/IPO/CMakeLists.txt =================================================================== --- lib/Transforms/IPO/CMakeLists.txt +++ lib/Transforms/IPO/CMakeLists.txt @@ -17,6 +17,7 @@ GlobalDCE.cpp GlobalOpt.cpp GlobalSplit.cpp + HotColdSplitting.cpp IPConstantPropagation.cpp IPO.cpp InferFunctionAttrs.cpp Index: lib/Transforms/IPO/HotColdSplitting.cpp =================================================================== --- /dev/null +++ lib/Transforms/IPO/HotColdSplitting.cpp @@ -0,0 +1,351 @@ +//===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Outline cold regions to a separate function. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/BlockFrequency.h" +#include "llvm/Support/BranchProbability.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include +#include + +#define DEBUG_TYPE "hotcoldsplit" + +STATISTIC(NumColdSESEFound, + "Number of cold single entry single exit (SESE) regions found."); +STATISTIC(NumColdSESEOutlined, + "Number of cold single entry single exit (SESE) regions outlined."); + +using namespace llvm; + +static cl::opt EnableStaticAnalyis("hot-cold-static-analysis", + cl::init(true), cl::Hidden); + + +namespace { + +struct PostDomTree : PostDomTreeBase { + PostDomTree(Function &F) { recalculate(F); } +}; + +typedef DenseSet DenseSetBB; + +// From: https://reviews.llvm.org/D22558 +// Exit is not part of the region. +static bool +isSingleEntrySingleExit(BasicBlock *Entry, const BasicBlock *Exit, + DominatorTree *DT, PostDomTree *PDT, + SmallVectorImpl &Region) { + if (!DT->dominates(Entry, Exit)) + return false; + + if (!PDT->dominates(Exit, Entry)) + return false; + + Region.push_back(Entry); + for (auto I = df_begin(Entry), E = df_end(Entry); I != E;) { + if (*I == Exit) { + I.skipChildren(); + continue; + } + if (!DT->dominates(Entry, *I)) + return false; + Region.push_back(*I); + ++I; + } + return true; +} + +bool blockEndsInUnreachable(const BasicBlock &BB) { + if (BB.empty()) + return true; + const TerminatorInst *I = BB.getTerminator(); + if (isa(I) || isa(I)) + return true; + // Unreachable blocks do not have any successor. + return succ_empty(&BB); +} + +static +bool unlikelyExecuted(const BasicBlock &BB) { + if (blockEndsInUnreachable(BB)) + return true; + // Exception handling blocks are unlikely executed. + if (BB.isEHPad()) + return true; + for (const Instruction &I : BB) + if (const CallInst *CI = dyn_cast(&I)) { + // The block is cold if it calls functions tagged as cold or noreturn. + if (CI->hasFnAttr(Attribute::Cold) || + CI->hasFnAttr(Attribute::NoReturn)) + return true; + + // Assume that inline assembly is hot code. + if (isa(CI->getCalledValue())) + return false; + } + return false; +} + +static DenseSetBB getColdBlocks(Function &F) { + // First mark all function basic blocks as hot or cold. + DenseSet ColdBlocks; + for (BasicBlock &BB : F) + if (unlikelyExecuted(BB)) + ColdBlocks.insert(&BB); + // Forward propagation. + DenseSetBB AllColdBlocks; + SmallVector WL; + DenseSetBB Visited; + + const BasicBlock *It = &F.front(); + const TerminatorInst *TI = It->getTerminator(); + if (!ColdBlocks.count(It)) { + Visited.insert(It); + // Breadth First Search to mark edges not reachable from cold. + // Basically we want to skip a cold basic block (CB) and all blocks + // dominated by CB. TODO: Maybe PHI-insertion algorithm can be used here. + WL.push_back(It); + while (WL.size() > 0) { + It = WL.pop_back_val(); + for (const BasicBlock *Succ : TI->successors()) { + // Do not visit blocks that are cold. + if (!ColdBlocks.count(Succ) && !Visited.count(Succ)) { + Visited.insert(Succ); + WL.push_back(Succ); + } + } + } + } + + // Mark all the blocks that were not visited as cold. + for (const BasicBlock &BB : F) + if (!Visited.count(&BB)) + AllColdBlocks.insert(&BB); + // TODO: Backward propagation to construct larger cold region + // as a union of smaller cold regions. e.g., an if-block branches out to + // two cold regions then the if block is also cold. CHI nodes to track + // cold edges can be used for faster backward propagation. + return AllColdBlocks; +} + +class HotColdSplitting { +public: + HotColdSplitting(ProfileSummaryInfo *ProfSI, + function_ref GBFI, + std::function *GORE) + : PSI(ProfSI), GetBFI(GBFI), GetORE(GORE) {} + bool run(Module &M); + +private: + bool shouldOutlineFrom(const Function &F) const; + Function *outlineColdBlocks(Function &F, + const DenseSet &ColdBlock, + DominatorTree *DT, PostDomTree *PDT); + Function *extractColdRegion(const SmallVectorImpl &Region, + DominatorTree *DT, BlockFrequencyInfo *BFI, + OptimizationRemarkEmitter &ORE); + bool isOutlineCandidate(const SmallVectorImpl &Region, + const BasicBlock *Exit) const { + // TODO: Find a better metric to compute the size of region being outlined. + if (Region.size() == 1) + return false; + // Regions with landing pads etc. + for (const BasicBlock *BB : Region) { + if (BB->isEHPad() || BB->hasAddressTaken()) + return false; + } + return true; + } + SmallVector OutlinedFunctions; + ProfileSummaryInfo *PSI; + function_ref GetBFI; + std::function *GetORE; +}; + +class HotColdSplittingLegacyPass : public ModulePass { +public: + static char ID; + HotColdSplittingLegacyPass() : ModulePass(ID) { + initializeHotColdSplittingLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + } + + bool runOnModule(Module &M) override; +}; + +} // end anonymous namespace + +bool HotColdSplitting::shouldOutlineFrom(const Function &F) const { + if (F.size() <= 2) + return false; + + if (F.hasAddressTaken()) + return false; + + if (F.hasFnAttribute(Attribute::AlwaysInline)) + return false; + + if (F.hasFnAttribute(Attribute::NoInline)) + return false; + + if (PSI->isFunctionEntryCold(&F)) + return false; + return true; +} + +Function * +HotColdSplitting::extractColdRegion(const SmallVectorImpl &Region, + DominatorTree *DT, BlockFrequencyInfo *BFI, + OptimizationRemarkEmitter &ORE) { + CodeExtractor CE(Region, DT, /*AggregateArgs*/ false, BFI, nullptr, + /* AllowVarargs */ false); + + SetVector Inputs, Outputs, Sinks; + CE.findInputsOutputs(Inputs, Outputs, Sinks); + + // Do not extract regions that have live exit variables. + if (Outputs.size() > 0) + return nullptr; + + if (Function *OutF = CE.extractCodeRegion()) { + User *U = *OutF->user_begin(); + CallInst *CI = cast(U); + CallSite CS(CI); + OutlinedFunctions.push_back(OutF); + NumColdSESEOutlined++; + OutF->setCallingConv(CallingConv::Cold); + CS.setCallingConv(CallingConv::Cold); + CI->setIsNoInline(); + DEBUG(llvm::dbgs() << "Outlined Region at block: " << Region.front()); + return OutF; + } + + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", + &*Region[0]->begin()) + << "Failed to extract region at block " + << ore::NV("Block", Region.front()); + }); + return nullptr; +} + +// Return the function created after outlining, nullptr otherwise. +Function *HotColdSplitting::outlineColdBlocks(Function &F, + const DenseSetBB &ColdBlock, + DominatorTree *DT, + PostDomTree *PDT) { + auto BFI = GetBFI(F); + auto &ORE = (*GetORE)(F); + // Walking the dominator tree allows us to find the largest + // cold region. + BasicBlock *Begin = DT->getRootNode()->getBlock(); + for (auto I = df_begin(Begin), E = df_end(Begin); I != E; ++I) { + BasicBlock *BB = *I; + if (PSI->isColdBB(BB, BFI) || ColdBlock.count(BB)) { + // Estimated cold region between a BB and its dom-frontier. + BasicBlock *Exit = (*PDT)[BB]->getIDom()->getBlock(); + SmallVector Region; + if (isSingleEntrySingleExit(BB, Exit, DT, PDT, Region) && + isOutlineCandidate(Region, Exit)) { + ++NumColdSESEFound; + // Candidate for outlining. FIXME: Continue outlining. + return extractColdRegion(Region, DT, BFI, ORE); + } + } + } + return nullptr; +} + +bool HotColdSplitting::run(Module &M) { + for (auto &F : M) { + if (!shouldOutlineFrom(F)) + continue; + DominatorTree DT(F); + PostDomTree PDT(F); + PDT.recalculate(F); + DenseSetBB ColdBlocks; + if (EnableStaticAnalyis) // Static analysis of cold blocks. + ColdBlocks = getColdBlocks(F); + + outlineColdBlocks(F, ColdBlocks, &DT, &PDT); + } + return true; +} + +bool HotColdSplittingLegacyPass::runOnModule(Module &M) { + if (skipModule(M)) + return false; + ProfileSummaryInfo *PSI = + getAnalysis().getPSI(); + auto GBFI = [this](Function &F) { + return &this->getAnalysis(F).getBFI(); + }; + std::unique_ptr ORE; + std::function GetORE = + [&ORE](Function &F) -> OptimizationRemarkEmitter & { + ORE.reset(new OptimizationRemarkEmitter(&F)); + return *ORE.get(); + }; + + return HotColdSplitting(PSI, GBFI, &GetORE).run(M); +} + +char HotColdSplittingLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit", + "Hot Cold Splitting", false, false) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit", + "Hot Cold Splitting", false, false) + +ModulePass *llvm::createHotColdSplittingPass() { + return new HotColdSplittingLegacyPass(); +} Index: lib/Transforms/IPO/IPO.cpp =================================================================== --- lib/Transforms/IPO/IPO.cpp +++ lib/Transforms/IPO/IPO.cpp @@ -36,6 +36,7 @@ initializeGlobalDCELegacyPassPass(Registry); initializeGlobalOptLegacyPassPass(Registry); initializeGlobalSplitPass(Registry); + initializeHotColdSplittingLegacyPassPass(Registry); initializeIPCPPass(Registry); initializeAlwaysInlinerLegacyPassPass(Registry); initializeSimpleInlinerPass(Registry);