Index: include/llvm/Analysis/ProfileSummaryInfo.h
===================================================================
--- include/llvm/Analysis/ProfileSummaryInfo.h
+++ include/llvm/Analysis/ProfileSummaryInfo.h
@@ -110,6 +110,14 @@
   bool isHotCallSite(const CallSite &CS, BlockFrequencyInfo *BFI);
   /// \brief Returns true if Callsite \p CS is considered cold.
   bool isColdCallSite(const CallSite &CS, BlockFrequencyInfo *BFI);
+  /// \brief Returns HotCountThreshold if set.
+  uint64_t getHotCountThreshold() {
+    return HotCountThreshold ? HotCountThreshold.getValue() : 0;
+  }
+  /// \brief Returns ColdCountThreshold if set.
+  uint64_t getColdCountThreshold() {
+    return ColdCountThreshold ? ColdCountThreshold.getValue() : 0;
+  }
 };
 
 /// An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Index: lib/Transforms/IPO/PartialInlining.cpp
===================================================================
--- lib/Transforms/IPO/PartialInlining.cpp
+++ lib/Transforms/IPO/PartialInlining.cpp
@@ -22,10 +22,13 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DominanceFrontier.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
@@ -67,17 +70,73 @@
 STATISTIC(NumPartialInlined,
           "Number of callsites functions partially inlined into.");
+STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
+                                        "cold outlined regions were partially "
+                                        "inlined into their caller(s).");
+STATISTIC(NumColdRegionsFound,
+          "Number of cold single entry/exit regions found.");
+STATISTIC(NumColdRegionsOutlined,
+          "Number of cold single entry/exit regions outlined.");
 
 // Command line option to disable partial-inlining. The default is false:
 static cl::opt<bool>
     DisablePartialInlining("disable-partial-inlining", cl::init(false),
-                           cl::Hidden, cl::desc("Disable partial ininling"));
+                           cl::Hidden, cl::desc("Disable partial inlining"));
+
+// Command line option to disable multi-region partial-inlining. The default is
+// false:
+static cl::opt<bool> DisableMultiRegionPartialInline(
+    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
+    cl::desc("Disable multi-region partial inlining"));
+
+// Command line option to force outlining in regions with live exit variables.
+// The default is false:
+static cl::opt<bool>
+    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
+                  cl::desc("Force outline regions with live exits"));
+
+// Command line option to disable marking outline functions with the cold
+// calling convention. The default is false:
+static cl::opt<bool>
+    DoNotMarkColdCC("pi-no-mark-coldcc", cl::init(false), cl::Hidden,
+                    cl::desc("Do not mark outline function calls with ColdCC"));
+
+enum TraceLevel {
+  None,
+  Sparse,
+  Full,
+};
+
+// Command line option to debug partial-inlining. The default is none:
+static cl::opt<TraceLevel> PartialInliningTraceLevel(
+    "trace-partial-inlining", cl::init(TraceLevel::None), cl::Hidden,
+    cl::desc("Trace partial inlining level"),
+    cl::values(clEnumValN(TraceLevel::None, "none", "None"),
+               clEnumValN(TraceLevel::Sparse, "sparse", "Sparse"),
+               clEnumValN(TraceLevel::Full, "full", "Full")));
 
 // This is an option used by testing:
 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                       cl::init(false), cl::ZeroOrMore,
                                       cl::ReallyHidden,
                                       cl::desc("Skip Cost Analysis"));
+
+// Used to determine if a cold region is worth outlining based on
+// its inlining cost compared to the original function. Default is set at 10%,
+// i.e. the cold region is outlined only if it reduces the inlining cost of the
+// original function by at least 10%.
+static cl::opt<float> MinRegionSizeRatio(
+    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
+    cl::desc("Minimum ratio comparing relative sizes of each "
+             "outline candidate and original function"));
+// Used to tune the minimum number of execution counts needed in the
+// predecessor block to the cold edge, i.e. a confidence interval.
+static cl::opt<unsigned>
+    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
+                             cl::desc("Minimum block executions to consider "
+                                      "its BranchProbabilityInfo valid"));
+// Used to determine when an edge is considered cold. Default is set to 10%,
+// i.e. if the branch probability is 10% or less, then it is deemed cold.
+static cl::opt<float> ColdBranchRatio(
+    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
+    cl::desc("Maximum BranchProbability to consider a region cold."));
 
 static cl::opt<unsigned> MaxNumInlineBlocks(
     "max-num-inline-blocks", cl::init(5), cl::Hidden,
@@ -104,6 +163,11 @@
 namespace {
 
+struct {
+  bool Sparse;
+  bool Full;
+} TracePartialInlining;
+
 struct FunctionOutliningInfo {
   FunctionOutliningInfo() = default;
@@ -125,7 +189,28 @@
   SmallVector<BasicBlock *, 4> ReturnBlockPreds;
 };
 
+struct FunctionOutliningMultiRegionInfo {
+  FunctionOutliningMultiRegionInfo()
+      : ORI() {}
+
+  // Container for outline regions.
+  struct OutlineRegionInfo {
+    OutlineRegionInfo(SmallVector<BasicBlock *, 8> Region,
+                      BasicBlock *EntryBlock, BasicBlock *ExitBlock,
+                      BasicBlock *ReturnBlock)
+        : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
+          ReturnBlock(ReturnBlock) {}
+    SmallVector<BasicBlock *, 8> Region;
+    BasicBlock *EntryBlock;
+    BasicBlock *ExitBlock;
+    BasicBlock *ReturnBlock;
+  };
+
+  SmallVector<OutlineRegionInfo, 4> ORI;
+};
+
 struct PartialInlinerImpl {
+
   PartialInlinerImpl(
       std::function<AssumptionCache &(Function &)> *GetAC,
       std::function<TargetTransformInfo &(Function &)> *GTTI,
@@ -134,14 +219,24 @@
       : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
 
   bool run(Module &M);
-  Function *unswitchFunction(Function *F);
+  // Main part of the transformation that calls helper functions to find
+  // outlining candidates, clone & outline the function, and attempt to
+  // partially inline the resulting function. Returns true if inlining was
+  // successful, false otherwise. Also returns the outline function (only if
+  // we partially inlined early returns) as there is a possibility to further
+  // "peel" early return statements that were left in the outline function due
+  // to code size.
+  std::pair<bool, Function *> unswitchFunction(Function *F);
 
   // This class speculatively clones the the function to be partial inlined.
   // At the end of partial inlining, the remaining callsites to the cloned
   // function that are not partially inlined will be fixed up to reference
   // the original function, and the cloned function will be erased.
  struct FunctionCloner {
+    // Two constructors, one for single region outlining, the other for
+    // multi-region outlining.
     FunctionCloner(Function *F, FunctionOutliningInfo *OI);
+    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI);
     ~FunctionCloner();
 
     // Prepare for function outlining: making sure there is only
@@ -149,19 +244,27 @@
     // the return block.
     void NormalizeReturnBlock();
 
-    // Do function outlining:
-    Function *doFunctionOutlining();
+    // Do function outlining for cold regions.
+    bool doMultiRegionFunctionOutlining();
+    // Do function outlining for the region after early return block(s).
+    Function *doSingleRegionFunctionOutlining();
 
     Function *OrigFunc = nullptr;
     Function *ClonedFunc = nullptr;
-    Function *OutlinedFunc = nullptr;
-    BasicBlock *OutliningCallBB = nullptr;
+
+    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
+    // Keep track of Outlined Functions and the basic block they're called from.
+    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
+
     // ClonedFunc is inlined in one of its callers after function
     // outlining.
     bool IsFunctionInlined = false;
     // The cost of the region to be outlined.
     int OutlinedRegionCost = 0;
+    // ClonedOI is specific to outlining non-early return blocks.
     std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
+    // ClonedOMRI is specific to outlining cold regions.
+    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
     std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
   };
@@ -236,6 +339,8 @@
   static int computeBBInlineCost(BasicBlock *BB);
 
   std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
+  std::unique_ptr<FunctionOutliningMultiRegionInfo>
+  computeOutliningColdRegionsInfo(Function *F);
 
 };
 
 struct PartialInlinerLegacyPass : public ModulePass {
@@ -271,12 +376,174 @@
       return TTIWP->getTTI(F);
     };
 
-    return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M);
+    return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI)
+        .run(M);
   }
 };
 
 } // end anonymous namespace
 
+std::unique_ptr<FunctionOutliningMultiRegionInfo>
+PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F) {
+  BasicBlock *EntryBlock = &F->front();
+
+  DominatorTree DT(*F);
+  DominanceFrontier DF;
+  DF.analyze(DT);
+  LoopInfo LI(DT);
+  BranchProbabilityInfo BPI(*F, LI);
+  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
+  BlockFrequencyInfo *BFI;
+  if (!GetBFI) {
+    ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
+    BFI = ScopedBFI.get();
+  } else
+    BFI = &(*GetBFI)(*F);
+
+  auto IsReturnBlock = [](BasicBlock *BB) {
+    TerminatorInst *TI = BB->getTerminator();
+    return isa<ReturnInst>(TI);
+  };
+
+  // Return if we don't have profiling information.
+  if (!PSI->hasProfileSummary() || !PSI->hasInstrumentationProfile())
+    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
+
+  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
+      llvm::make_unique<FunctionOutliningMultiRegionInfo>();
+
+  auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
+    BasicBlock *Dom = BlockList.front();
+    return BlockList.size() > 1 &&
+           std::distance(pred_begin(Dom), pred_end(Dom)) == 1;
+  };
+
+  auto IsSingleExit =
+      [IsReturnBlock](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
+    BasicBlock *ExitBlock = nullptr;
+    for (auto *Block : BlockList) {
+      if (IsReturnBlock(Block))
+        return nullptr;
+      for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
+        if (!is_contained(BlockList, *SI)) {
+          if (TracePartialInlining.Full)
+            dbgs() << "Found exit block = " << (*SI)->getName() << "\n";
+          if (ExitBlock)
+            return nullptr;
+          else
+            ExitBlock = Block;
+        }
+      }
+    }
+    return ExitBlock;
+  };
+
+  auto BBProfileCount = [BFI](BasicBlock *BB) {
+    return BFI->getBlockProfileCount(BB)
+               ? BFI->getBlockProfileCount(BB).getValue()
+               : 0;
+  };
+
+  // Use the same computeBBInlineCost function to compute the cost savings of
+  // outlining the candidate region.
+  int OverallFunctionCost = 0;
+  for (auto &BB : *F)
+    OverallFunctionCost += computeBBInlineCost(&BB);
+
+  int MinOutlineRegionCost =
+      static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
+  BranchProbability MinBranchProbability(
+      static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
+      MinBlockCounterExecution);
+  bool ColdCandidateFound = false;
+  BasicBlock *CurrEntry = EntryBlock;
+  std::vector<BasicBlock *> DFS;
+  DenseMap<BasicBlock *, bool> VisitedMap;
+  DFS.push_back(CurrEntry);
+  VisitedMap[CurrEntry] = true;
+  // Use Depth First Search on the basic blocks to find CFG edges that are
+  // considered cold.
+  // Each candidate cold region's inline cost is also compared to the overall
+  // inline cost of the original function. The region is outlined only if it
+  // reduces the inline cost of the function by 'MinOutlineRegionCost' or more.
+  while (!DFS.empty()) {
+    auto *thisBB = DFS.back();
+    DFS.pop_back();
+    // Only consider regions with predecessor blocks that are considered
+    // not-cold (default: part of the top 99.99% of all block counters)
+    // AND greater than our minimum block execution count (default: 100).
+    if (PSI->isColdBB(thisBB, BFI) ||
+        BBProfileCount(thisBB) < MinBlockCounterExecution)
+      continue;
+    for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
+      if (VisitedMap[*SI])
+        continue;
+      VisitedMap[*SI] = true;
+      DFS.push_back(*SI);
+      // If the branch isn't cold, we skip to the next one.
+      BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
+      if (SuccProb > MinBranchProbability)
+        continue;
+      if (TracePartialInlining.Full) {
+        dbgs() << "Found cold edge: " << thisBB->getName() << "->"
+               << (*SI)->getName() << "\n";
+        dbgs() << "Branch Probability = " << SuccProb << "\n";
+      }
+      SmallVector<BasicBlock *, 8> DominateVector;
+      DT.getDescendants(*SI, DominateVector);
+      // We can only outline single entry regions (for now).
+      if (!IsSingleEntry(DominateVector))
+        continue;
+      if (TracePartialInlining.Full)
+        dbgs() << "Dominates more than one block. Single predecessor.\n";
+      BasicBlock *ExitBlock = nullptr;
+      // We can only outline single exit regions (for now).
+      if (!(ExitBlock = IsSingleExit(DominateVector)))
+        continue;
+      int OutlineRegionCost = 0;
+      for (auto *BB : DominateVector)
+        OutlineRegionCost += computeBBInlineCost(BB);
+
+      if (TracePartialInlining.Full) {
+        dbgs() << "Single Exit Block = " << ExitBlock->getName() << "\n";
+        dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
+        dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
+      }
+
+      if (OutlineRegionCost < MinOutlineRegionCost) {
+        if (TracePartialInlining.Full) {
+          dbgs() << "Inline cost-savings smaller than " << MinOutlineRegionCost
+                 << ". Skip.\n";
+        }
+        continue;
+      }
+      // For now, ignore blocks that belong to a SISE region that is a
+      // candidate for outlining. In the future, we may want to look
+      // at inner regions because the outer region may have live-exit
+      // variables.
+ for (auto *BB : DominateVector) + VisitedMap[BB] = true; + // ReturnBlock here means the block after the outline call + BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor(); + // assert(ReturnBlock && "ReturnBlock is NULL somehow!"); + FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo( + DominateVector, DominateVector.front(), ExitBlock, ReturnBlock); + RegInfo.Region = DominateVector; + OutliningInfo->ORI.push_back(RegInfo); + if (TracePartialInlining.Full) { + dbgs() << "Found Cold Candidate starting at block: " + << DominateVector.front()->getName() << "\n"; + } + ColdCandidateFound = true; + NumColdRegionsFound++; + } + } + if (ColdCandidateFound) + return OutliningInfo; + else + return std::unique_ptr(); +} + std::unique_ptr PartialInlinerImpl::computeOutliningInfo(Function *F) { BasicBlock *EntryBlock = &F->front(); @@ -448,14 +715,19 @@ BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) { + BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second; auto EntryFreq = Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock()); auto OutliningCallFreq = - Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB); - - auto OutlineRegionRelFreq = - BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(), - EntryFreq.getFrequency()); + Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB); + // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE + // we outlined any regions, so we may encounter situations where the + // OutliningCallFreq is *slightly* bigger than the EntryFreq. + if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) { + OutliningCallFreq = EntryFreq; + } + auto OutlineRegionRelFreq = BranchProbability::getBranchProbability( + OutliningCallFreq.getFrequency(), EntryFreq.getFrequency()); if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get())) return OutlineRegionRelFreq; @@ -611,22 +883,25 @@ std::tuple PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) { - // Now compute the cost of the call sequence to the outlined function - // 'OutlinedFunction' in BB 'OutliningCallBB': - int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB); - - // Now compute the cost of the extracted/outlined function itself: - int OutlinedFunctionCost = 0; - for (BasicBlock &BB : *Cloner.OutlinedFunc) { - OutlinedFunctionCost += computeBBInlineCost(&BB); + int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0; + for (auto FuncBBPair : Cloner.OutlinedFunctions) { + Function *OutlinedFunc = FuncBBPair.first; + BasicBlock* OutliningCallBB = FuncBBPair.second; + // Now compute the cost of the call sequence to the outlined function + // 'OutlinedFunction' in BB 'OutliningCallBB': + OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB); + + // Now compute the cost of the extracted/outlined function itself: + for (BasicBlock &BB : *OutlinedFunc) + OutlinedFunctionCost += computeBBInlineCost(&BB); } - assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && "Outlined function cost should be no less than the outlined region"); + // The code extractor introduces a new root and exit stub blocks with // additional unconditional branches. Those branches will be eliminated // later with bb layout. 
The cost should be adjusted accordingly: - OutlinedFunctionCost -= 2 * InlineConstants::InstrCost; + OutlinedFunctionCost -= 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size(); int OutliningRuntimeOverhead = OutliningFuncCallCost + @@ -703,6 +978,37 @@ F->replaceAllUsesWith(ClonedFunc); } +PartialInlinerImpl::FunctionCloner::FunctionCloner( + Function *F, FunctionOutliningMultiRegionInfo *OI) + : OrigFunc(F) { + ClonedOMRI = llvm::make_unique(); + + // Clone the function, so that we can hack away on it. + ValueToValueMapTy VMap; + ClonedFunc = CloneFunction(F, VMap); + + // Go through all Outline Candidate Regions and update all BasicBlock + // information. + for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : + OI->ORI) { + SmallVector Region; + for (BasicBlock *BB : RegionInfo.Region) { + Region.push_back(cast(VMap[BB])); + } + BasicBlock *NewEntryBlock = cast(VMap[RegionInfo.EntryBlock]); + BasicBlock *NewExitBlock = cast(VMap[RegionInfo.ExitBlock]); + BasicBlock *NewReturnBlock = nullptr; + if (RegionInfo.ReturnBlock) + NewReturnBlock = cast(VMap[RegionInfo.ReturnBlock]); + FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo( + Region, NewEntryBlock, NewExitBlock, NewReturnBlock); + ClonedOMRI->ORI.push_back(MappedRegionInfo); + } + // Go ahead and update all uses to the duplicate, so that we can just + // use the inliner functionality when we're done hacking. + F->replaceAllUsesWith(ClonedFunc); +} + void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() { auto getFirstPHI = [](BasicBlock *BB) { BasicBlock::iterator I = BB->begin(); @@ -719,6 +1025,11 @@ return FirstPhi; }; + // Shouldn't need to normalize PHIs if we're not outlining non-early return + // blocks. + if (!ClonedOI) + return; + // Special hackery is needed with PHI nodes that have inputs from more than // one extracted block. For simplicity, just split the PHIs into a two-level // sequence of PHIs, some of which will go in the extracted region, and some @@ -769,16 +1080,83 @@ DeadPhis.push_back(OldPhi); } ++I; - } - for (auto *DP : DeadPhis) - DP->eraseFromParent(); + } + for (auto *DP : DeadPhis) + DP->eraseFromParent(); - for (auto E : ClonedOI->ReturnBlockPreds) { - E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); + for (auto E : ClonedOI->ReturnBlockPreds) { + E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); + } +} + +bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { + + auto ComputeRegionCost = [](SmallVectorImpl &Region) { + int Cost = 0; + for (BasicBlock* BB : Region) + Cost += computeBBInlineCost(BB); + return Cost; + }; + + assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline"); + + if (ClonedOMRI->ORI.empty()) + return false; + + // The CodeExtractor needs a dominator tree. + DominatorTree DT; + DT.recalculate(*ClonedFunc); + + // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 
+ LoopInfo LI(DT); + BranchProbabilityInfo BPI(*ClonedFunc, LI); + ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); + + SetVector Inputs, Outputs, Sinks; + for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : ClonedOMRI->ORI) { + int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region); + + CodeExtractor CE (RegionInfo.Region, &DT, /*AggregateArgs*/ false, ClonedFuncBFI.get(), &BPI); + + CE.findInputsOutputs(Inputs, Outputs, Sinks); + + if (TracePartialInlining.Full) { + dbgs() << "inputs: " << Inputs.size() << "\n"; + dbgs() << "outputs: " << Outputs.size() << "\n"; + for (Value *value : Inputs) + dbgs() << "value used in func: " << *value << "\n"; + for (Value *output : Outputs) + dbgs() << "instr used in func: " << *output << "\n"; } + + // Do not extract regions that have live exit variables. + if (Outputs.size() > 0 && !ForceLiveExit) + continue; + + Function *OutlinedFunc = CE.extractCodeRegion(); + + if (OutlinedFunc) { + CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc); + BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent(); + assert(OutliningCallBB->getParent() == ClonedFunc); + OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB)); + NumColdRegionsOutlined++; + OutlinedRegionCost += CurrentOutlinedRegionCost; + + if (!DoNotMarkColdCC) { + OutlinedFunc->setCallingConv(CallingConv::Cold); + OCS.setCallingConv(CallingConv::Cold); + } + } else + if (TracePartialInlining.Sparse) + dbgs() << "CodeExtractor did not extract code region\n"; + + } + + return !OutlinedFunctions.empty(); } -Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() { +Function * PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { // Returns true if the block is to be partial inlined into the caller // (i.e. not to be extracted to the out of line function) auto ToBeInlined = [&, this](BasicBlock *BB) { @@ -787,6 +1165,16 @@ ClonedOI->Entries.end()); }; + assert(ClonedOI && "Expecting OutlineInfo for single region outline"); + // The CodeExtractor needs a dominator tree. + DominatorTree DT; + DT.recalculate(*ClonedFunc); + + // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. + LoopInfo LI(DT); + BranchProbabilityInfo BPI(*ClonedFunc, LI); + ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); + // Gather up the blocks that we're going to extract. std::vector ToExtract; ToExtract.push_back(ClonedOI->NonReturnBlock); @@ -802,25 +1190,18 @@ OutlinedRegionCost += computeBBInlineCost(&BB); } - // The CodeExtractor needs a dominator tree. - DominatorTree DT; - DT.recalculate(*ClonedFunc); - - // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. - LoopInfo LI(DT); - BranchProbabilityInfo BPI(*ClonedFunc, LI); - ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); - // Extract the body of the if. 
- OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, - ClonedFuncBFI.get(), &BPI) - .extractCodeRegion(); + Function *OutlinedFunc = + CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, + ClonedFuncBFI.get(), &BPI) + .extractCodeRegion(); if (OutlinedFunc) { - OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc) + BasicBlock *OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc) .getInstruction() ->getParent(); assert(OutliningCallBB->getParent() == ClonedFunc); + OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB)); } return OutlinedFunc; @@ -832,62 +1213,129 @@ ClonedFunc->replaceAllUsesWith(OrigFunc); ClonedFunc->eraseFromParent(); if (!IsFunctionInlined) { - // Remove the function that is speculatively created if there is no + // Remove each function that was speculatively created if there is no // reference. - if (OutlinedFunc) - OutlinedFunc->eraseFromParent(); + for (auto FuncBBPair : OutlinedFunctions) { + Function *Func = FuncBBPair.first; + Func->eraseFromParent(); + } } } -Function *PartialInlinerImpl::unswitchFunction(Function *F) { +std::pair PartialInlinerImpl::unswitchFunction(Function *F) { + if (F->hasAddressTaken()) - return nullptr; + return {false,nullptr}; // Let inliner handle it if (F->hasFnAttribute(Attribute::AlwaysInline)) - return nullptr; + return {false,nullptr}; if (F->hasFnAttribute(Attribute::NoInline)) - return nullptr; + return {false,nullptr}; if (PSI->isFunctionEntryCold(F)) - return nullptr; + return {false,nullptr}; if (F->user_begin() == F->user_end()) - return nullptr; + return {false,nullptr}; - std::unique_ptr OI = computeOutliningInfo(F); + if (TracePartialInlining.Full) { + dbgs() << ">>>>>> Original Function >>>>>>\n"; + F->print(dbgs()); + dbgs() << "<<<<<< Original Function <<<<<<\n"; + } + + // Only try to outline cold regions if we have a profile summary, which + // implies we have profiling information. + if (PSI->hasProfileSummary() && F->getEntryCount().hasValue() && + !DisableMultiRegionPartialInline) { + std::unique_ptr OMRI = + computeOutliningColdRegionsInfo(F); + if (OMRI) { + FunctionCloner Cloner(F, OMRI.get()); + + if (TracePartialInlining.Full) { + dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n"; + dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold() + << "\n"; + } + bool DidOutline = Cloner.doMultiRegionFunctionOutlining(); + + if (DidOutline) { + if (TracePartialInlining.Full) { + dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n"; + Cloner.ClonedFunc->print(dbgs()); + dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n"; + } + + if (tryPartialInline(Cloner)) + return {true, nullptr}; + } + } + } + // Fall-thru to regular partial inlining if we: + // i) can't find any cold regions to outline, or + // ii) can't inline the outlined function anywhere. + std::unique_ptr OI = computeOutliningInfo(F); if (!OI) - return nullptr; + return {false, nullptr}; FunctionCloner Cloner(F, OI.get()); Cloner.NormalizeReturnBlock(); - Function *OutlinedFunction = Cloner.doFunctionOutlining(); + + if (TracePartialInlining.Full) { + dbgs() << "ClonedFunc: \n"; + Cloner.ClonedFunc->print(dbgs()); + } + + Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining(); + + if (OutlinedFunction) { + if (TracePartialInlining.Full) + dbgs() << "Outlined at least one region. 
Try Inlining.\n"; + } else { + if (TracePartialInlining.Full) + dbgs() << "Didn't outline any functions.\n"; + return {false, nullptr}; + } bool AnyInline = tryPartialInline(Cloner); if (AnyInline) - return OutlinedFunction; + return {true, OutlinedFunction}; - return nullptr; + return {false, nullptr}; } bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { - int NonWeightedRcost; - int SizeCost; - - if (Cloner.OutlinedFunc == nullptr) + if (Cloner.OutlinedFunctions.empty()) return false; + int SizeCost = 0; + BlockFrequency WeightedRcost; + int NonWeightedRcost; std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner); - auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); - auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq; - - // The call sequence to the outlined function is larger than the original - // outlined region size, it does not increase the chances of inlining - // the function with outlining (The inliner uses the size increase to + // Only calculate RelativeToEntryFreq when we are doing single region + // outlining. + BranchProbability RelativeToEntryFreq; + if (Cloner.ClonedOI) { + RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); + } else + // RelativeToEntryFreq doesn't make sense when we have more than one + // outlined call because each call will have a different relative frequency + // to the entry block. We can consider using the average, but the + // usefulness of that information is questionable. For now, assume we never + // execute the calls to outlined functions. + RelativeToEntryFreq = BranchProbability(0, 1); + + WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq; + + // The call sequence(s) to the outlined function(s) are larger than the sum of + // the original outlined region size(s), it does not increase the chances of + // inlining the function with outlining (The inliner uses the size increase to // model the cost of inlining a callee). 
if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) { OptimizationRemarkEmitter ORE(Cloner.OrigFunc); @@ -952,7 +1400,11 @@ AnyInline = true; NumPartialInlining++; // Update the stats - NumPartialInlined++; + if (Cloner.ClonedOI) + NumPartialInlined++; + else + NumColdOutlinePartialInlined++; + } if (AnyInline) { @@ -968,6 +1420,17 @@ if (DisablePartialInlining) return false; + switch (PartialInliningTraceLevel) { + case TraceLevel::Full: + TracePartialInlining.Full = true; + // Fall thru + case TraceLevel::Sparse: + TracePartialInlining.Sparse = true; + break; + default: + break; + } + std::vector Worklist; Worklist.reserve(M.size()); for (Function &F : M) @@ -992,11 +1455,14 @@ if (Recursive) continue; - if (Function *NewFunc = unswitchFunction(CurrFunc)) { - Worklist.push_back(NewFunc); + std::pair Result = unswitchFunction(CurrFunc); + if (Result.second) + Worklist.push_back(Result.second); + if (Result.first) Changed = true; - } } + if (TracePartialInlining.Full && Changed) + dbgs() << "Partially inlined at least one function.\n"; return Changed; } Index: test/Transforms/CodeExtractor/PartialInlinePGORegion.ll =================================================================== --- /dev/null +++ test/Transforms/CodeExtractor/PartialInlinePGORegion.ll @@ -0,0 +1,120 @@ +; RUN: opt -S -partial-inliner -min-block-execution=1 -skip-partial-inlining-cost-analysis < %s | FileCheck %s +; RUN: opt -S -passes=partial-inliner -min-block-execution=1 -skip-partial-inlining-cost-analysis < %s | FileCheck %s +; Require a dummy block (if.then.b) as successor to if.then due to PI requirement +; of region containing more than one BB. +define signext i32 @bar(i32 signext %value, i32 signext %ub) #0 !prof !30 { +entry: + %value.addr = alloca i32, align 4 + %ub.addr = alloca i32, align 4 + %sum = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %value, i32* %value.addr, align 4 + store i32 %ub, i32* %ub.addr, align 4 + store i32 0, i32* %sum, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %ub.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end, !prof !31 + +for.body: ; preds = %for.cond + %2 = load i32, i32* %value.addr, align 4 + %rem = srem i32 %2, 20 + %cmp1 = icmp eq i32 %rem, 0 + br i1 %cmp1, label %if.then, label %if.else, !prof !32 + +if.then: ; preds = %for.body + %3 = load i32, i32* %value.addr, align 4 + %4 = load i32, i32* %i, align 4 + %mul = mul nsw i32 %4, 5 + %add = add nsw i32 %3, %mul + %5 = load i32, i32* %sum, align 4 + %add2 = add nsw i32 %5, %add + store i32 %add2, i32* %sum, align 4 + br label %if.then.b + +if.then.b: ; preds = %if.then + br label %if.end + +if.else: ; preds = %for.body + %6 = load i32, i32* %value.addr, align 4 + %7 = load i32, i32* %i, align 4 + %sub = sub nsw i32 %6, %7 + %8 = load i32, i32* %sum, align 4 + %add3 = add nsw i32 %8, %sub + store i32 %add3, i32* %sum, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + br label %for.inc + +for.inc: ; preds = %if.end + %9 = load i32, i32* %i, align 4 + %inc = add nsw i32 %9, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %10 = load i32, i32* %sum, align 4 + ret i32 %10 +} + +define signext i32 @foo(i32 signext %value, i32 signext %ub) #0 !prof !30 { +; CHECK-LABEL: @foo +; CHECK: codeRepl.i: +; CHECK-NOT: call signext i32 @bar +; CHECK: call coldcc void @bar.1_if.then +entry: + %value.addr = 
alloca i32, align 4 + %ub.addr = alloca i32, align 4 + store i32 %value, i32* %value.addr, align 4 + store i32 %ub, i32* %ub.addr, align 4 + %0 = load i32, i32* %value.addr, align 4 + %1 = load i32, i32* %ub.addr, align 4 + %call = call signext i32 @bar(i32 signext %0, i32 signext %1) + ret i32 %call +} + +; CHECK-LABEL: define internal coldcc void @bar.1_if.then +; CHECK: .exitStub: +; CHECK: ret void + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!29} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"PIC Level", i32 2} +!2 = !{i32 1, !"ProfileSummary", !3} +!3 = !{!4, !5, !6, !7, !8, !9, !10, !11} +!4 = !{!"ProfileFormat", !"InstrProf"} +!5 = !{!"TotalCount", i64 103} +!6 = !{!"MaxCount", i64 100} +!7 = !{!"MaxInternalCount", i64 1} +!8 = !{!"MaxFunctionCount", i64 100} +!9 = !{!"NumCounts", i64 5} +!10 = !{!"NumFunctions", i64 3} +!11 = !{!"DetailedSummary", !12} +!12 = !{!13, !14, !15, !16, !17, !18, !18, !19, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28} +!13 = !{i32 10000, i64 100, i32 1} +!14 = !{i32 100000, i64 100, i32 1} +!15 = !{i32 200000, i64 100, i32 1} +!16 = !{i32 300000, i64 100, i32 1} +!17 = !{i32 400000, i64 100, i32 1} +!18 = !{i32 500000, i64 100, i32 1} +!19 = !{i32 600000, i64 100, i32 1} +!20 = !{i32 700000, i64 100, i32 1} +!21 = !{i32 800000, i64 100, i32 1} +!22 = !{i32 900000, i64 100, i32 1} +!23 = !{i32 950000, i64 100, i32 1} +!24 = !{i32 990000, i64 1, i32 4} +!25 = !{i32 999000, i64 1, i32 4} +!26 = !{i32 999900, i64 1, i32 4} +!27 = !{i32 999990, i64 1, i32 4} +!28 = !{i32 999999, i64 1, i32 4} +!29 = !{!"clang version 6.0.0 (123456)"} +!30 = !{!"function_entry_count", i64 2} +!31 = !{!"branch_weights", i32 100, i32 1} +!32 = !{!"branch_weights", i32 0, i32 100}
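
Note on the cold-edge heuristic used by computeOutliningColdRegionsInfo(): the sketch below is illustrative only and is not part of the patch. It restates, in simplified form, the edge test the DFS applies (the PSI->isColdBB() pre-check on the predecessor is omitted), using the standard BranchProbabilityInfo and BlockFrequencyInfo APIs. The helper name isColdEdgeSketch is hypothetical, and the hard-coded 10/100 and 100 constants stand in for the cold-branch-ratio (0.1) and min-block-execution (100) defaults introduced above.

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Support/BranchProbability.h"

using namespace llvm;

// Returns true if the Src->Succ edge would be treated as cold under this
// patch's heuristic: the predecessor must have enough profile samples for its
// branch probabilities to be trusted, and the edge probability must be at or
// below the cold-branch ratio.
static bool isColdEdgeSketch(const BasicBlock *Src, const BasicBlock *Succ,
                             const BranchProbabilityInfo &BPI,
                             const BlockFrequencyInfo &BFI) {
  // min-block-execution (default 100): require enough samples in Src.
  if (BFI.getBlockProfileCount(Src).getValueOr(0) < 100)
    return false;
  // cold-branch-ratio (default 0.1), expressed here as 10/100.
  BranchProbability MinBranchProbability(10, 100);
  return BPI.getEdgeProbability(Src, Succ) <= MinBranchProbability;
}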