Index: include/llvm/Analysis/ProfileSummaryInfo.h =================================================================== --- include/llvm/Analysis/ProfileSummaryInfo.h +++ include/llvm/Analysis/ProfileSummaryInfo.h @@ -110,6 +110,14 @@ bool isHotCallSite(const CallSite &CS, BlockFrequencyInfo *BFI); /// \brief Returns true if Callsite \p CS is considered cold. bool isColdCallSite(const CallSite &CS, BlockFrequencyInfo *BFI); + /// \brief Returns HotCountThreshold if set. + uint64_t getHotCountThreshold() { + return HotCountThreshold ? HotCountThreshold.getValue() : 0; + } + /// \brief Returns ColdCountThreshold if set. + uint64_t getColdCountThreshold() { + return ColdCountThreshold ? ColdCountThreshold.getValue() : 0; + } }; /// An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo. Index: lib/Transforms/IPO/PartialInlining.cpp =================================================================== --- lib/Transforms/IPO/PartialInlining.cpp +++ lib/Transforms/IPO/PartialInlining.cpp @@ -22,10 +22,14 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DominanceFrontier.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -67,17 +71,73 @@ STATISTIC(NumPartialInlined, "Number of callsites functions partially inlined into."); +STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with " + "cold outlined regions were partially " + "inlined into its caller(s)."); +STATISTIC(NumColdRegionsFound, + "Number of cold single entry/exit regions found."); +STATISTIC(NumColdRegionsOutlined, + "Number of cold single entry/exit regions outlined."); // Command line option to disable partial-inlining. The default is false: static cl::opt DisablePartialInlining("disable-partial-inlining", cl::init(false), - cl::Hidden, cl::desc("Disable partial ininling")); + cl::Hidden, cl::desc("Disable partial inlining")); +// Command line option to disable multi-region partial-inlining. The default is +// false: +static cl::opt DisableMultiRegionPartialInline( + "disable-mr-partial-inlining", cl::init(false), cl::Hidden, + cl::desc("Disable multi-region partial inlining")); + +// Command line option to force outlining in regions with live exit variables. +// The default is false: +static cl::opt + ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, + cl::desc("Force outline regions with live exits")); + +// Command line option to disable marking outline functions with Cold Calling +// Convention. The default is false: +static cl::opt + DoNotMarkColdCC("pi-no-mark-coldcc", cl::init(false), cl::Hidden, + cl::desc("Do not outline function calls with ColdCC")); + +enum TraceLevel { + None, + Sparse, + Full, +}; +// Command line option to debug partial-inlining. The default is none: +static cl::opt PartialInliningTraceLevel( + "trace-partial-inlining", cl::init(TraceLevel::None), cl::Hidden, + cl::desc("Trace partial inlining level"), + cl::values(clEnumValN(TraceLevel::None, "none", "None"), + clEnumValN(TraceLevel::Sparse, "sparse", "Sparse"), + clEnumValN(TraceLevel::Full, "full", "Full"))); // This is an option used by testing: static cl::opt SkipCostAnalysis("skip-partial-inlining-cost-analysis", cl::init(false), cl::ZeroOrMore, cl::ReallyHidden, cl::desc("Skip Cost Analysis")); +// Used to determine if a cold region is worth outlining based on +// its inlining cost compared to the original function. Default is set at 10%. +// ie. if the cold region reduces the inlining cost of the original function by +// at least 10%. +static cl::opt MinRegionSizeRatio( + "min-region-size-ratio", cl::init(0.1), cl::Hidden, + cl::desc("Minimum ratio comparing relative sizes of each " + "outline candidate and original function")); +// Used to tune the minimum number of execution counts needed in the predecessor +// block to the cold edge. ie. confidence interval. +static cl::opt + MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, + cl::desc("Minimum block executions to consider " + "its BranchProbabilityInfo valid")); +// Used to determine when an edge is considered cold. Default is set to 10%. ie. +// if the branch probability is 10% or less, then it is deemed as 'cold'. +static cl::opt ColdBranchRatio( + "cold-branch-ratio", cl::init(0.1), cl::Hidden, + cl::desc("Minimum BranchProbability to consider a region cold.")); static cl::opt MaxNumInlineBlocks( "max-num-inline-blocks", cl::init(5), cl::Hidden, @@ -102,6 +162,9 @@ "partial-inlining-extra-penalty", cl::init(0), cl::Hidden, cl::desc("A debug option to add additional penalty to the computed one.")); +static bool TracePartialInliningSparse = false; +static bool TracePartialInliningFull = false; + namespace { struct FunctionOutliningInfo { @@ -125,6 +188,26 @@ SmallVector ReturnBlockPreds; }; +struct FunctionOutliningMultiRegionInfo { + FunctionOutliningMultiRegionInfo() + : ORI() {} + + // Container for outline regions + struct OutlineRegionInfo { + OutlineRegionInfo(SmallVector Region, + BasicBlock *EntryBlock, BasicBlock *ExitBlock, + BasicBlock *ReturnBlock) + : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock), + ReturnBlock(ReturnBlock) {} + SmallVector Region; + BasicBlock *EntryBlock; + BasicBlock *ExitBlock; + BasicBlock *ReturnBlock; + }; + + SmallVector ORI; +}; + struct PartialInlinerImpl { PartialInlinerImpl( std::function *GetAC, @@ -134,14 +217,24 @@ : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {} bool run(Module &M); - Function *unswitchFunction(Function *F); + // Main part of the transformation that calls helper functions to find + // outlining candidates, clone & outline the function, and attempt to + // partially inline the resulting function. Returns true if + // inlining was successful, false otherwise. Also returns the outline + // function (only if we partially inlined early returns) as there is a + // possibility to further "peel" early return statements that were left in the + // outline function due to code size. + std::pair unswitchFunction(Function *F); // This class speculatively clones the the function to be partial inlined. // At the end of partial inlining, the remaining callsites to the cloned // function that are not partially inlined will be fixed up to reference // the original function, and the cloned function will be erased. struct FunctionCloner { + // Two constructors, one for single region outlining, the other for + // multi-region outlining. FunctionCloner(Function *F, FunctionOutliningInfo *OI); + FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI); ~FunctionCloner(); // Prepare for function outlining: making sure there is only @@ -149,19 +242,27 @@ // the return block. void NormalizeReturnBlock(); - // Do function outlining: - Function *doFunctionOutlining(); + // Do function outlining for cold regions. + bool doMultiRegionFunctionOutlining(); + // Do function outlining for region after early return block(s). + Function *doSingleRegionFunctionOutlining(); Function *OrigFunc = nullptr; Function *ClonedFunc = nullptr; - Function *OutlinedFunc = nullptr; - BasicBlock *OutliningCallBB = nullptr; + + typedef std::pair FuncBodyCallerPair; + // Keep track of Outlined Functions and the basic block they're called from. + SmallVector OutlinedFunctions; + // ClonedFunc is inlined in one of its callers after function // outlining. bool IsFunctionInlined = false; // The cost of the region to be outlined. int OutlinedRegionCost = 0; + // ClonedOI is specific to outlining non-early return blocks. std::unique_ptr ClonedOI = nullptr; + // ClonedOMRI is specific to outlining cold regions. + std::unique_ptr ClonedOMRI = nullptr; std::unique_ptr ClonedFuncBFI = nullptr; }; @@ -189,6 +290,8 @@ // if there is any successful inlining. bool tryPartialInline(FunctionCloner &Cloner); + bool tryNewPartialInline(FunctionCloner &Cloner); + // Compute the mapping from use site of DuplicationFunction to the enclosing // BB's profile count. void computeCallsiteToProfCountMap(Function *DuplicateFunction, @@ -236,6 +339,8 @@ static int computeBBInlineCost(BasicBlock *BB); std::unique_ptr computeOutliningInfo(Function *F); + std::unique_ptr computeOutliningColdRegionsInfo(Function *F); + }; struct PartialInlinerLegacyPass : public ModulePass { @@ -271,12 +376,196 @@ return TTIWP->getTTI(F); }; - return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M); + return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI).run(M); } }; } // end anonymous namespace +struct PostDomTree : PostDomTreeBase { + PostDomTree(Function &F) { recalculate(F); } +}; + +std::unique_ptr +PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F) { + BasicBlock *EntryBlock = &F->front(); + BranchInst *BR = dyn_cast(EntryBlock->getTerminator()); + if (!BR || BR->isUnconditional()) + return std::unique_ptr(); + + DominatorTree DT(*F); + PostDominatorTree PDT; + PDT.recalculate(*F); + DominanceFrontier DF; + DF.analyze(DT); + LoopInfo LI(DT); + BranchProbabilityInfo BPI(*F, LI); + if (TracePartialInliningFull) { + dbgs() << "BranchProbabilityInfo:\n"; + BPI.print(dbgs()); + } + std::unique_ptr ScopedBFI; + BlockFrequencyInfo *BFI; + if (!GetBFI) { + ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI)); + BFI = ScopedBFI.get(); + } else + BFI = &(*GetBFI)(*F); + + if (TracePartialInliningFull) { + dbgs() << "BlockFrequencyInfo:\n"; + BFI->print(dbgs()); + dbgs() << "DominatorTree:\n"; + DT.print(dbgs()); + } + + auto IsReturnBlock = [](BasicBlock *BB) { + TerminatorInst *TI = BB->getTerminator(); + return isa(TI); + }; + + // Return if we don't have profiling information. + if (!PSI->hasProfileSummary() || !PSI->hasInstrumentationProfile()) + return std::unique_ptr(); + + std::unique_ptr OutliningInfo = + llvm::make_unique(); + + auto IsSingleEntry = [](SmallVectorImpl &BlockList) { + BasicBlock *Dom = BlockList.front(); + return BlockList.size() > 1 && + std::distance(pred_begin(Dom), pred_end(Dom)) == 1; + }; + + auto IsSingleExit = + [IsReturnBlock](SmallVectorImpl &BlockList) -> BasicBlock * { + BasicBlock *ExitBlock = nullptr; + for (auto *Block : BlockList) { + if (IsReturnBlock(Block)) + return nullptr; + for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) { + if (!is_contained(BlockList, *SI)) { + if (TracePartialInliningFull) + dbgs() << "Found exit block = " << (*SI)->getName() << "\n"; + if (ExitBlock) + return nullptr; + else + ExitBlock = Block; + } + } + } + return ExitBlock; + }; + + auto BBProfileCount = [BFI](BasicBlock *BB) { + return BFI->getBlockProfileCount(BB) + ? BFI->getBlockProfileCount(BB).getValue() + : 0; + }; + + // Use the same computeBBInlineCost function to compute the cost savings of + // the outlining the candidate region. + int OverallFunctionCost = 0; + for (auto &BB : *F) + OverallFunctionCost += computeBBInlineCost(&BB); + + int MinOutlineRegionCost = + static_cast(OverallFunctionCost * MinRegionSizeRatio); + BranchProbability MinBranchProbability( + static_cast(ColdBranchRatio * MinBlockCounterExecution), + MinBlockCounterExecution); + bool ColdCandidateFound = false; + BasicBlock *CurrEntry = EntryBlock; + std::vector DFS; + DenseMap VisitedMap; + DFS.push_back(CurrEntry); + VisitedMap[CurrEntry] = true; + // Use Depth First Search on the basic blocks to find CFG edges that are + // considered cold. + // Cold regions considered must also have its inline cost compared to the + // overall inline cost of the original function. The region is outlined only + // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or + // more. + while (!DFS.empty()) { + auto *thisBB = DFS.back(); + DFS.pop_back(); + // Only consider regions with predecessor blocks that are considered + // not-cold (default: part of the top 99.99% of all block counters) + // AND greater than our minimum block execution count (default: 100). + if (PSI->isColdBB(thisBB, BFI) || + BBProfileCount(thisBB) < MinBlockCounterExecution) + continue; + for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) { + if (VisitedMap[*SI]) + continue; + VisitedMap[*SI] = true; + DFS.push_back(*SI); + // If branch isn't cold, we skip to the next one. + BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI); + if (SuccProb > MinBranchProbability) + continue; + if (TracePartialInliningFull) { + dbgs() << "Found cold edge: " << thisBB->getName() << "->" + << (*SI)->getName() << "\n"; + dbgs() << "Branch Probability = " << SuccProb << "\n"; + } + SmallVector DominateVector; + DT.getDescendants(*SI, DominateVector); + // We can only outline single entry regions (for now). + if (!IsSingleEntry(DominateVector)) + continue; + if (TracePartialInliningFull) + dbgs() << "Dominates more than one block. Single predecessor.\n"; + // We can only outline single exit regions (for now). + if (auto *ExitBlock = IsSingleExit(DominateVector)) { + int OutlineRegionCost = 0; + for (auto *BB : DominateVector) + OutlineRegionCost += computeBBInlineCost(BB); + + if (TracePartialInliningFull) { + dbgs() << "Single Exit Block = " << ExitBlock->getName() << "\n"; + dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n"; + dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n"; + } + + if (OutlineRegionCost < MinOutlineRegionCost) { + if (TracePartialInliningFull) { + dbgs() << "Inline cost-savings smaller than " + << MinOutlineRegionCost << ". Skip.\n"; + } + continue; + } + // For now, ignore blocks that belong to a SISE region that is a + // candidate for outlining. In the future, we may want to look + // at inner regions because the outer region may have live-exit + // variables. + for (auto *BB : DominateVector) + VisitedMap[BB] = true; + // can't outline if region contains a return block + if (IsReturnBlock(ExitBlock)) + continue; + // ReturnBlock here means the block after the outline call + BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor(); + // assert(ReturnBlock && "ReturnBlock is NULL somehow!"); + FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo( + DominateVector, DominateVector.front(), ExitBlock, ReturnBlock); + RegInfo.Region = DominateVector; + OutliningInfo->ORI.push_back(RegInfo); + if (TracePartialInliningFull) { + dbgs() << "Found Cold Candidate starting at block: " + << DominateVector.front()->getName() << "\n"; + } + ColdCandidateFound = true; + NumColdRegionsFound++; + } + } + } + if (ColdCandidateFound) + return OutliningInfo; + else + return std::unique_ptr(); +} + std::unique_ptr PartialInlinerImpl::computeOutliningInfo(Function *F) { BasicBlock *EntryBlock = &F->front(); @@ -448,14 +737,19 @@ BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) { + BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second; auto EntryFreq = Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock()); auto OutliningCallFreq = - Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB); - - auto OutlineRegionRelFreq = - BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(), - EntryFreq.getFrequency()); + Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB); + // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE + // we outlined any regions, so we may encounter situations where the + // OutliningCallFreq is *slightly* bigger than the EntryFreq. + if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) { + OutliningCallFreq = EntryFreq; + } + auto OutlineRegionRelFreq = BranchProbability::getBranchProbability( + OutliningCallFreq.getFrequency(), EntryFreq.getFrequency()); if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get())) return OutlineRegionRelFreq; @@ -600,13 +894,16 @@ std::tuple PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) { + Function *OutlinedFunc; + BasicBlock* OutliningCallBB; + std::tie(OutlinedFunc, OutliningCallBB) = Cloner.OutlinedFunctions.back(); // Now compute the cost of the call sequence to the outlined function // 'OutlinedFunction' in BB 'OutliningCallBB': - int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB); + int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB); // Now compute the cost of the extracted/outlined function itself: int OutlinedFunctionCost = 0; - for (BasicBlock &BB : *Cloner.OutlinedFunc) { + for (BasicBlock &BB : *OutlinedFunc) { OutlinedFunctionCost += computeBBInlineCost(&BB); } @@ -692,6 +989,37 @@ F->replaceAllUsesWith(ClonedFunc); } +PartialInlinerImpl::FunctionCloner::FunctionCloner( + Function *F, FunctionOutliningMultiRegionInfo *OI) + : OrigFunc(F) { + ClonedOMRI = llvm::make_unique(); + + // Clone the function, so that we can hack away on it. + ValueToValueMapTy VMap; + ClonedFunc = CloneFunction(F, VMap); + + // Go through all Outline Candidate Regions and update all BasicBlock + // information. + for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : + OI->ORI) { + SmallVector Region; + for (BasicBlock *BB : RegionInfo.Region) { + Region.push_back(cast(VMap[BB])); + } + BasicBlock *NewEntryBlock = cast(VMap[RegionInfo.EntryBlock]); + BasicBlock *NewExitBlock = cast(VMap[RegionInfo.ExitBlock]); + BasicBlock *NewReturnBlock = nullptr; + if (RegionInfo.ReturnBlock) + NewReturnBlock = cast(VMap[RegionInfo.ReturnBlock]); + FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo( + Region, NewEntryBlock, NewExitBlock, NewReturnBlock); + ClonedOMRI->ORI.push_back(MappedRegionInfo); + } + // Go ahead and update all uses to the duplicate, so that we can just + // use the inliner functionality when we're done hacking. + F->replaceAllUsesWith(ClonedFunc); +} + void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() { auto getFirstPHI = [](BasicBlock *BB) { BasicBlock::iterator I = BB->begin(); @@ -708,6 +1036,11 @@ return FirstPhi; }; + // Shouldn't need to normalize PHIs if we're not outlining non-early return + // blocks. + if (!ClonedOI) + return; + // Special hackery is needed with PHI nodes that have inputs from more than // one extracted block. For simplicity, just split the PHIs into a two-level // sequence of PHIs, some of which will go in the extracted region, and some @@ -758,16 +1091,84 @@ DeadPhis.push_back(OldPhi); } ++I; + } + for (auto *DP : DeadPhis) + DP->eraseFromParent(); + + for (auto E : ClonedOI->ReturnBlockPreds) { + E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); + } +} + +bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { + + auto ComputeRegionCost = [](SmallVectorImpl &Region) { + int Cost = 0; + for (BasicBlock* BB : Region) + Cost += computeBBInlineCost(BB); + return Cost; + }; + + assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline"); + + if (ClonedOMRI->ORI.empty()) + return false; + + // The CodeExtractor needs a dominator tree. + DominatorTree DT; + DT.recalculate(*ClonedFunc); + + // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. + LoopInfo LI(DT); + BranchProbabilityInfo BPI(*ClonedFunc, LI); + ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); + + SetVector Inputs, Outputs, Sinks; + for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : ClonedOMRI->ORI) { + + OutlinedRegionCost += ComputeRegionCost(RegionInfo.Region); + + CodeExtractor CE (RegionInfo.Region, &DT, /*AggregateArgs*/ false, ClonedFuncBFI.get(), &BPI); + + CE.findInputsOutputs(Inputs, Outputs, Sinks); + + + if (TracePartialInliningFull) { + DEBUG(dbgs() << "inputs: " << Inputs.size() << "\n"); + DEBUG(dbgs() << "outputs: " << Outputs.size() << "\n"); + for (Value *value : Inputs) { + DEBUG(dbgs() << "value used in func: " << *value << "\n"); + } + + for (Value *output : Outputs) { + DEBUG(dbgs() << "instr used in func: " << *output << "\n"); + } } - for (auto *DP : DeadPhis) - DP->eraseFromParent(); - for (auto E : ClonedOI->ReturnBlockPreds) { - E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); + // Do not extract regions that have live exit variables. + if (Outputs.size() > 0 && !ForceLiveExit) + continue; + + Function *OutlinedFunc = CE.extractCodeRegion(); + + if (OutlinedFunc) { + CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc); + BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent(); + assert(OutliningCallBB->getParent() == ClonedFunc); + OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB)); + NumColdRegionsOutlined++; + + if (!DoNotMarkColdCC) { + OutlinedFunc->setCallingConv(CallingConv::Cold); + OCS.setCallingConv(CallingConv::Cold); + } } + } + + return !OutlinedFunctions.empty(); } -Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() { +Function * PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { // Returns true if the block is to be partial inlined into the caller // (i.e. not to be extracted to the out of line function) auto ToBeInlined = [&, this](BasicBlock *BB) { @@ -776,6 +1177,16 @@ ClonedOI->Entries.end()); }; + assert(ClonedOI && "Expecting OutlineInfo for single region outline"); + // The CodeExtractor needs a dominator tree. + DominatorTree DT; + DT.recalculate(*ClonedFunc); + + // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. + LoopInfo LI(DT); + BranchProbabilityInfo BPI(*ClonedFunc, LI); + ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); + // Gather up the blocks that we're going to extract. std::vector ToExtract; ToExtract.push_back(ClonedOI->NonReturnBlock); @@ -791,25 +1202,18 @@ OutlinedRegionCost += computeBBInlineCost(&BB); } - // The CodeExtractor needs a dominator tree. - DominatorTree DT; - DT.recalculate(*ClonedFunc); - - // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. - LoopInfo LI(DT); - BranchProbabilityInfo BPI(*ClonedFunc, LI); - ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); - // Extract the body of the if. - OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, - ClonedFuncBFI.get(), &BPI) - .extractCodeRegion(); + Function *OutlinedFunc = + CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, + ClonedFuncBFI.get(), &BPI) + .extractCodeRegion(); if (OutlinedFunc) { - OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc) + BasicBlock *OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc) .getInstruction() ->getParent(); assert(OutliningCallBB->getParent() == ClonedFunc); + OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB)); } return OutlinedFunc; @@ -821,52 +1225,107 @@ ClonedFunc->replaceAllUsesWith(OrigFunc); ClonedFunc->eraseFromParent(); if (!IsFunctionInlined) { - // Remove the function that is speculatively created if there is no + // Remove each function that was speculatively created if there is no // reference. - if (OutlinedFunc) - OutlinedFunc->eraseFromParent(); + for (auto FuncBBPair : OutlinedFunctions) { + Function *Func = FuncBBPair.first; + Func->eraseFromParent(); + } } } -Function *PartialInlinerImpl::unswitchFunction(Function *F) { +std::pair PartialInlinerImpl::unswitchFunction(Function *F) { + if (F->hasAddressTaken()) - return nullptr; + return {false,nullptr}; // Let inliner handle it if (F->hasFnAttribute(Attribute::AlwaysInline)) - return nullptr; + return {false,nullptr}; if (F->hasFnAttribute(Attribute::NoInline)) - return nullptr; + return {false,nullptr}; if (PSI->isFunctionEntryCold(F)) - return nullptr; + return {false,nullptr}; if (F->user_begin() == F->user_end()) - return nullptr; + return {false,nullptr}; - std::unique_ptr OI = computeOutliningInfo(F); + if (TracePartialInliningFull) { + dbgs() << ">>>>>> Original Function >>>>>>\n"; + F->print(dbgs()); + dbgs() << "<<<<<< Original Function <<<<<<\n"; + } + + // Only try to outline cold regions if we have a profile summary, which + // implies we have profiling information. + if (PSI->hasProfileSummary() && F->getEntryCount().hasValue() && + !DisableMultiRegionPartialInline) { + std::unique_ptr OMRI = + computeOutliningColdRegionsInfo(F); + if (OMRI) { + FunctionCloner Cloner(F, OMRI.get()); + + if (TracePartialInliningFull) { + dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n"; + dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold() + << "\n"; + } + bool DidOutline = Cloner.doMultiRegionFunctionOutlining(); + + if (DidOutline) { + if (TracePartialInliningFull) { + dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n"; + Cloner.ClonedFunc->print(dbgs()); + dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n"; + } + + if (tryNewPartialInline(Cloner)) + return {true, nullptr}; + } + } + } + // Fall-thru to regular partial inlining if we: + // i) can't find any cold regions to outline, or + // ii) can't inline the outlined function anywhere. + std::unique_ptr OI = computeOutliningInfo(F); if (!OI) - return nullptr; + return {false, nullptr}; FunctionCloner Cloner(F, OI.get()); Cloner.NormalizeReturnBlock(); - Function *OutlinedFunction = Cloner.doFunctionOutlining(); + + if (TracePartialInliningFull) { + dbgs() << "ClonedFunc: \n"; + Cloner.ClonedFunc->print(dbgs()); + } + + Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining(); + + if (OutlinedFunction) { + if (TracePartialInliningFull) + dbgs() << "Outlined at least one region. Try Inlining.\n"; + } else { + if (TracePartialInliningFull) + dbgs() << "Didn't outline any functions.\n"; + return {false, nullptr}; + } bool AnyInline = tryPartialInline(Cloner); if (AnyInline) - return OutlinedFunction; + return {true, OutlinedFunction}; - return nullptr; + return {false, nullptr}; } bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { int NonWeightedRcost; int SizeCost; - if (Cloner.OutlinedFunc == nullptr) + if (Cloner.OutlinedFunctions.empty()) return false; std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner); @@ -947,10 +1406,82 @@ return AnyInline; } +static bool +shouldInline(InlineCost IC) { + // ORE would shure be useful here ... + if(IC.isNever()) + return false; + + if(IC.isAlways()) + return true; + + return IC.getCostDelta() >= 0; +} + +bool PartialInlinerImpl::tryNewPartialInline(FunctionCloner &Cloner) { + // loop over all the call sites that call the cloned function. + assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() && + "F's users should all be replaced!"); + assert(Cloner.OutlinedFunctions.size() && + "Expected at least 1 outlined reigon!"); + std::vector Users(Cloner.ClonedFunc->user_begin(), + Cloner.ClonedFunc->user_end()); + OptimizationRemarkEmitter ORE(Cloner.OrigFunc); + + DenseMap CallSiteToProfCountMap; + assert(Cloner.OrigFunc->getEntryCount()); + computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); + uint64_t CalleeEntryCount = *Cloner.OrigFunc->getEntryCount(); + + bool AnyInlined = false; + for (User *User : Users) { + CallSite CS = getCallSite(User); + Function *Callee = CS.getCalledFunction(); + auto &CalleeTTI = (*GetTTI)(*Callee); + InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI, + *GetAssumptionCache, GetBFI, PSI, &ORE); + if (!shouldInline(IC)) + continue; + + if (TracePartialInliningFull || TracePartialInliningSparse) { + dbgs() << "Partial Inlined callee " << Callee->getName(); + dbgs() << " in caller " << User->getName() << "in file "; + dbgs() << Callee->getParent()->getName() << "\n"; + } + ++NumColdOutlinePartialInlined; + + AnyInlined = true; + InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI); + InlineFunction(CS, IFI); + + if (CallSiteToProfCountMap.count(User)) + CalleeEntryCount -= + std::min(CalleeEntryCount, CallSiteToProfCountMap[User]); + } + + if (AnyInlined) { + Cloner.IsFunctionInlined = true; + Cloner.OrigFunc->setEntryCount(CalleeEntryCount); + } + + return AnyInlined; +} + bool PartialInlinerImpl::run(Module &M) { if (DisablePartialInlining) return false; + switch (PartialInliningTraceLevel) { + case TraceLevel::Full: + TracePartialInliningFull = true; + // Fall thru + case TraceLevel::Sparse: + TracePartialInliningSparse = true; + break; + default: + break; + } + std::vector Worklist; Worklist.reserve(M.size()); for (Function &F : M) @@ -975,11 +1506,14 @@ if (Recursive) continue; - if (Function *NewFunc = unswitchFunction(CurrFunc)) { - Worklist.push_back(NewFunc); + std::pair Result = unswitchFunction(CurrFunc); + if (Result.second) + Worklist.push_back(Result.second); + if (Result.first) Changed = true; - } } + if (TracePartialInliningFull && Changed) + dbgs() << "Partially inlined at least one function.\n"; return Changed; }