Index: llvm/trunk/include/llvm/Analysis/InlineCost.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/InlineCost.h
+++ llvm/trunk/include/llvm/Analysis/InlineCost.h
@@ -20,6 +20,7 @@
 namespace llvm {
 class AssumptionCacheTracker;
+class BlockFrequencyInfo;
 class CallSite;
 class DataLayout;
 class Function;
@@ -38,6 +39,21 @@
 const unsigned TotalAllocaSizeRecursiveCaller = 1024;
 }
 
+/// \brief Block frequency analysis for multiple functions.
+/// This class mimics block frequency analysis on CGSCC level. Block frequency
+/// info is computed on demand and cached until it is invalidated.
+class BlockFrequencyAnalysis {
+private:
+  DenseMap<Function *, BlockFrequencyInfo *> BFM;
+
+public:
+  ~BlockFrequencyAnalysis();
+  /// \brief Returns BlockFrequencyInfo for a function.
+  BlockFrequencyInfo *getBlockFrequencyInfo(Function *);
+  /// \brief Invalidates block frequency info for a function.
+  void invalidateBlockFrequencyInfo(Function *);
+};
+
 /// \brief Represents the cost of inlining a function.
 ///
 /// This supports special values for functions which should "always" or
@@ -111,7 +127,8 @@
 /// inlining the callsite. It is an expensive, heavyweight call.
 InlineCost getInlineCost(CallSite CS, int DefaultThreshold,
                          TargetTransformInfo &CalleeTTI,
-                         AssumptionCacheTracker *ACT);
+                         AssumptionCacheTracker *ACT,
+                         BlockFrequencyAnalysis *BFA);
 
 /// \brief Get an InlineCost with the callee explicitly specified.
 /// This allows you to calculate the cost of inlining a function via a
@@ -120,7 +137,8 @@
 //
 InlineCost getInlineCost(CallSite CS, Function *Callee, int DefaultThreshold,
                          TargetTransformInfo &CalleeTTI,
-                         AssumptionCacheTracker *ACT);
+                         AssumptionCacheTracker *ACT,
+                         BlockFrequencyAnalysis *BFA);
 
 int computeThresholdFromOptLevels(unsigned OptLevel, unsigned SizeOptLevel);
 
@@ -129,6 +147,9 @@
 
 /// \brief Minimal filter to detect invalid constructs for inlining.
 bool isInlineViable(Function &Callee);
+
+/// \brief Return estimated count of the block \p BB.
+Optional<uint64_t> getBlockCount(BasicBlock *BB, BlockFrequencyAnalysis *BFA);
 }
 
 #endif
Index: llvm/trunk/include/llvm/Transforms/IPO/InlinerPass.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/IPO/InlinerPass.h
+++ llvm/trunk/include/llvm/Transforms/IPO/InlinerPass.h
@@ -24,8 +24,18 @@
 class CallSite;
 class DataLayout;
 class InlineCost;
+class BlockFrequencyAnalysis;
 template <class PtrType, unsigned SmallSize> class SmallPtrSet;
 
+// Functor invoked when a block is cloned during inlining.
+typedef std::function<void(const BasicBlock *, const BasicBlock *)>
+    BlockCloningFunctor;
+// Functor invoked when a function is inlined inside the basic block
+// containing the call.
+typedef std::function FunctionCloningFunctor;
+// Functor invoked when a function gets deleted during inlining.
+typedef std::function FunctionDeletedFunctor;
+
 /// Inliner - This class contains all of the helper code which is used to
 /// perform the inlining operations that do not depend on the policy.
 ///
@@ -69,9 +79,28 @@
   /// shouldInline - Return true if the inliner should attempt to
   /// inline at the given CallSite.
   bool shouldInline(CallSite CS);
+  /// Set the BFI of \p Dst to be the same as \p Src.
+  void copyBlockFrequency(BasicBlock *Src, BasicBlock *Dst);
+  /// Invalidates BFI for function \p F.
+  void invalidateBFI(Function *F);
+  /// Invalidates BFI for all functions in \p SCC.
+  void invalidateBFI(CallGraphSCC &SCC);
+  /// Update function entry count for \p Callee which has been inlined into
+  /// \p CallBB.
+  void updateEntryCount(BasicBlock *CallBB, Function *Callee);
+  /// \brief Update block frequency of an inlined block.
+  /// This method updates the block frequency of \p NewBB which is a clone of
+  /// \p OrigBB when the callsite \p CS gets inlined. The frequency of \p NewBB
+  /// is computed as follows:
+  /// Freq(NewBB) = Freq(OrigBB) * CallSiteFreq / CalleeEntryFreq.
+  void updateBlockFreq(CallSite &CS, const BasicBlock *OrigBB,
+                       const BasicBlock *NewBB);
 
 protected:
   AssumptionCacheTracker *ACT;
+  std::unique_ptr<BlockFrequencyAnalysis> BFA;
+  /// Are we using profile guided optimization?
+  bool HasProfileData;
 };
 
 } // End llvm namespace
Index: llvm/trunk/include/llvm/Transforms/Utils/Cloning.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Utils/Cloning.h
+++ llvm/trunk/include/llvm/Transforms/Utils/Cloning.h
@@ -48,6 +48,9 @@
 class AssumptionCacheTracker;
 class DominatorTree;
 
+typedef std::function<void(const BasicBlock *, const BasicBlock *)>
+    BlockCloningFunctor;
+
 /// Return an exact copy of the specified module
 ///
 std::unique_ptr<Module> CloneModule(const Module *M);
@@ -157,7 +160,8 @@
                        ValueToValueMapTy &VMap, bool ModuleLevelChanges,
                        SmallVectorImpl<ReturnInst*> &Returns,
                        const char *NameSuffix = "",
-                       ClonedCodeInfo *CodeInfo = nullptr);
+                       ClonedCodeInfo *CodeInfo = nullptr,
+                       BlockCloningFunctor Ftor = nullptr);
 
 /// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto,
 /// except that it does some simple constant prop and DCE on the fly. The
@@ -172,23 +176,27 @@
 ///
 void CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
                                ValueToValueMapTy &VMap, bool ModuleLevelChanges,
-                               SmallVectorImpl<ReturnInst*> &Returns,
+                               SmallVectorImpl<ReturnInst *> &Returns,
                                const char *NameSuffix = "",
                                ClonedCodeInfo *CodeInfo = nullptr,
-                               Instruction *TheCall = nullptr);
+                               Instruction *TheCall = nullptr,
+                               BlockCloningFunctor Ftor = nullptr);
 
 /// InlineFunctionInfo - This class captures the data input to the
 /// InlineFunction call, and records the auxiliary results produced by it.
 class InlineFunctionInfo {
 public:
   explicit InlineFunctionInfo(CallGraph *cg = nullptr,
-                              AssumptionCacheTracker *ACT = nullptr)
-      : CG(cg), ACT(ACT) {}
+                              AssumptionCacheTracker *ACT = nullptr,
+                              BlockCloningFunctor Ftor = nullptr)
+      : CG(cg), ACT(ACT), Ftor(Ftor) {}
 
   /// CG - If non-null, InlineFunction will update the callgraph to reflect the
   /// changes it makes.
   CallGraph *CG;
   AssumptionCacheTracker *ACT;
+  // Functor that is invoked when a block is cloned into the new function.
+  BlockCloningFunctor Ftor;
 
   /// StaticAllocas - InlineFunction fills this in with all static allocas that
   /// get copied into the caller.
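The header changes above are easiest to see from the client side. The sketch below is illustrative only and is not part of this patch: it shows how a pass that owns a BlockFrequencyAnalysis could estimate a callsite's execution count with getBlockCount and register a BlockCloningFunctor that fires for every (original, cloned) block pair produced while inlining. The helper name analyzeCallSite is hypothetical; everything else uses the declarations introduced above plus the long-standing InlineFunction entry point.

#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;

// Hypothetical helper: estimate the callsite count, then inline with a hook
// that reports each cloned block.
static void analyzeCallSite(CallSite CS, BlockFrequencyAnalysis &BFA) {
  BasicBlock *CallBB = CS.getInstruction()->getParent();
  // Estimated count = caller entry count * Freq(CallBB) / caller entry freq.
  if (Optional<uint64_t> Count = getBlockCount(CallBB, &BFA))
    dbgs() << "estimated callsite count: " << *Count << "\n";

  // Invoked once per block cloned from the callee into the caller.
  BlockCloningFunctor BCF = [](const BasicBlock *OrigBB,
                               const BasicBlock *NewBB) {
    dbgs() << OrigBB->getName() << " cloned as " << NewBB->getName() << "\n";
  };
  InlineFunctionInfo IFI(/*cg=*/nullptr, /*ACT=*/nullptr, BCF);
  InlineFunction(CS, IFI);
}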
Index: llvm/trunk/lib/Analysis/InlineCost.cpp
===================================================================
--- llvm/trunk/lib/Analysis/InlineCost.cpp
+++ llvm/trunk/lib/Analysis/InlineCost.cpp
@@ -18,13 +18,18 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/InstVisitor.h"
@@ -85,6 +90,7 @@
   // easily cacheable. Instead, use the cover function paramHasAttr.
   CallSite CandidateCS;
 
+  BlockFrequencyAnalysis *BFA;
   int Threshold;
   int Cost;
 
@@ -153,6 +159,8 @@
   /// passed to support analyzing indirect calls whose target is inferred by
   /// analysis.
   void updateThreshold(CallSite CS, Function &Callee);
+  /// Adjust Threshold based on CallSiteCount and return the adjusted threshold.
+  int getAdjustedThreshold(int Threshold, Optional<uint64_t> CallSiteCount);
 
   // Custom analysis routines.
   bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
@@ -194,17 +202,19 @@
 
 public:
   CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
-               Function &Callee, int Threshold, CallSite CSArg)
-      : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
-        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
-        ExposesReturnsTwice(false), HasDynamicAlloca(false),
-        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
-        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
-        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
-        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
-        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
-        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
-        SROACostSavings(0), SROACostSavingsLost(0) {}
+               Function &Callee, int Threshold, CallSite CSArg,
+               BlockFrequencyAnalysis *BFA)
+      : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), BFA(BFA),
+        Threshold(Threshold), Cost(0), IsCallerRecursive(false),
+        IsRecursiveCall(false), ExposesReturnsTwice(false),
+        HasDynamicAlloca(false), ContainsNoDuplicateCall(false),
+        HasReturn(false), HasIndirectBr(false), HasFrameEscape(false),
+        AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
+        FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
+        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
+        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
+        NumInstructionsSimplified(0), SROACostSavings(0),
+        SROACostSavingsLost(0) {}
 
   bool analyzeCall(CallSite CS);
 
@@ -572,6 +582,15 @@
   return false;
 }
 
+// Adjust the threshold based on callsite hotness. Currently this is a nop.
+int CallAnalyzer::getAdjustedThreshold(int Threshold,
+                                       Optional<uint64_t> CallSiteCount
+                                           __attribute__((unused))) {
+  // FIXME: The new threshold should be computed from the given Threshold and
+  // the callsite hotness.
+  return Threshold;
+}
+
 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
   // If -inline-threshold is not given, listen to the optsize and minsize
   // attributes when they would decrease the threshold.
@@ -596,6 +615,9 @@
     FunctionCount = Callee.getEntryCount().getValue();
     MaxFunctionCount = Callee.getParent()->getMaximumFunctionCount().getValue();
   }
+  Optional<uint64_t> CallSiteCount =
+      llvm::getBlockCount(CS.getInstruction()->getParent(), BFA);
+  Threshold = getAdjustedThreshold(Threshold, CallSiteCount);
 
   // Listen to the inlinehint attribute or profile based hotness information
   // when it would increase the threshold and the caller does not need to
@@ -912,7 +934,8 @@
     // during devirtualization and so we want to give it a hefty bonus for
     // inlining, but cap that bonus in the event that inlining wouldn't pan
     // out. Pretend to inline the function, with a custom threshold.
-    CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
+    CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS,
+                    BFA);
     if (CA.analyzeCall(CS)) {
       // We were able to inline the indirect call! Subtract the cost from the
       // threshold to get the bonus we want to apply, but don't go below zero.
@@ -1433,9 +1456,10 @@
 
 InlineCost llvm::getInlineCost(CallSite CS, int DefaultThreshold,
                                TargetTransformInfo &CalleeTTI,
-                               AssumptionCacheTracker *ACT) {
+                               AssumptionCacheTracker *ACT,
+                               BlockFrequencyAnalysis *BFA) {
   return getInlineCost(CS, CS.getCalledFunction(), DefaultThreshold, CalleeTTI,
-                       ACT);
+                       ACT, BFA);
 }
 
 int llvm::computeThresholdFromOptLevels(unsigned OptLevel,
@@ -1454,7 +1478,8 @@
 InlineCost llvm::getInlineCost(CallSite CS, Function *Callee,
                                int DefaultThreshold,
                                TargetTransformInfo &CalleeTTI,
-                               AssumptionCacheTracker *ACT) {
+                               AssumptionCacheTracker *ACT,
+                               BlockFrequencyAnalysis *BFA) {
 
   // Cannot inline indirect calls.
   if (!Callee)
@@ -1487,7 +1512,7 @@
   DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                      << "...\n");
 
-  CallAnalyzer CA(CalleeTTI, ACT, *Callee, DefaultThreshold, CS);
+  CallAnalyzer CA(CalleeTTI, ACT, *Callee, DefaultThreshold, CS, BFA);
   bool ShouldInline = CA.analyzeCall(CS);
 
   DEBUG(CA.dump());
@@ -1535,3 +1560,45 @@
 
   return true;
 }
+
+/// \brief Get estimated execution count for \p BB.
+Optional<uint64_t> llvm::getBlockCount(BasicBlock *BB,
+                                       BlockFrequencyAnalysis *BFA) {
+  if (!BFA)
+    return None;
+  Function *F = BB->getParent();
+  Optional<uint64_t> EntryCount = F->getEntryCount();
+  if (!EntryCount)
+    return None;
+  BlockFrequencyInfo *BFI = BFA->getBlockFrequencyInfo(F);
+  uint64_t BBFreq = BFI->getBlockFreq(BB).getFrequency();
+  uint64_t FunctionEntryFreq = BFI->getEntryFreq();
+  uint64_t BBCount = EntryCount.getValue() * BBFreq / FunctionEntryFreq;
+  return BBCount;
+}
+
+BlockFrequencyAnalysis::~BlockFrequencyAnalysis() {
+  for (auto &Entry : BFM) {
+    delete Entry.second;
+  }
+}
+
+/// \brief Get BlockFrequencyInfo for a function.
+BlockFrequencyInfo *BlockFrequencyAnalysis::getBlockFrequencyInfo(Function *F) {
+  auto Iter = BFM.find(F);
+  if (Iter != BFM.end())
+    return Iter->second;
+  // We need to create a BlockFrequencyInfo object for F and store it.
+  DominatorTree DT;
+  DT.recalculate(*F);
+  LoopInfo LI(DT);
+  BranchProbabilityInfo BPI(*F, LI);
+  BlockFrequencyInfo *BFI = new BlockFrequencyInfo(*F, BPI, LI);
+  BFM[F] = BFI;
+  return BFI;
+}
+
+/// \brief Invalidate BlockFrequencyInfo for a function.
+void BlockFrequencyAnalysis::invalidateBlockFrequencyInfo(Function *F) {
+  BFM.erase(F);
+}
Index: llvm/trunk/lib/Transforms/IPO/InlineSimple.cpp
===================================================================
--- llvm/trunk/lib/Transforms/IPO/InlineSimple.cpp
+++ llvm/trunk/lib/Transforms/IPO/InlineSimple.cpp
@@ -59,7 +59,8 @@
   InlineCost getInlineCost(CallSite CS) override {
     Function *Callee = CS.getCalledFunction();
     TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
-    return llvm::getInlineCost(CS, DefaultThreshold, TTI, ACT);
+    return llvm::getInlineCost(CS, DefaultThreshold, TTI, ACT,
+                               HasProfileData ? BFA.get() : nullptr);
   }
 
   bool runOnSCC(CallGraphSCC &SCC) override;
Index: llvm/trunk/lib/Transforms/IPO/Inliner.cpp
===================================================================
--- llvm/trunk/lib/Transforms/IPO/Inliner.cpp
+++ llvm/trunk/lib/Transforms/IPO/Inliner.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
@@ -47,10 +48,13 @@
 // if those would be more profitable and blocked inline steps.
 STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed");
 
-Inliner::Inliner(char &ID) : CallGraphSCCPass(ID), InsertLifetime(true) {}
+Inliner::Inliner(char &ID)
+    : CallGraphSCCPass(ID), InsertLifetime(true),
+      BFA(new BlockFrequencyAnalysis()) {}
 
 Inliner::Inliner(char &ID, bool InsertLifetime)
-    : CallGraphSCCPass(ID), InsertLifetime(InsertLifetime) {}
+    : CallGraphSCCPass(ID), InsertLifetime(InsertLifetime),
+      BFA(new BlockFrequencyAnalysis()) {}
 
 /// For this class, we declare that we require and preserve the call graph.
 /// If the derived class implements this method, it should
@@ -259,7 +263,7 @@
                                  Twine(IC.getCostDelta() + IC.getCost()) + ")");
     return false;
   }
-  
+
   // Try to detect the case where the current inlining candidate caller (call
  // it B) is a static or linkonce-ODR function and is an inlining candidate
  // elsewhere, and the current candidate callee (call it C) is large enough
@@ -356,8 +360,90 @@
   return false;
 }
 
+/// \brief Update the frequency of a block that is cloned into the caller.
+/// This is invoked when \p OrigBB from the callee is cloned into \p NewBB in
+/// the caller.
+void Inliner::updateBlockFreq(CallSite &CS, const BasicBlock *OrigBB,
+                              const BasicBlock *NewBB) {
+  if (!HasProfileData)
+    return;
+  Instruction *Call = CS.getInstruction();
+  BasicBlock *CallBB = Call->getParent();
+  BlockFrequencyInfo *CalleeBFI =
+      BFA->getBlockFrequencyInfo(CS.getCalledFunction());
+  BlockFrequencyInfo *CallerBFI =
+      BFA->getBlockFrequencyInfo(CallBB->getParent());
+  // Find the number of times OrigBB is executed per invocation of the callee
+  // and multiply by the number of times callee is executed in the caller.
+  // Freq(NewBB) = Freq(OrigBB) * CallSiteFreq / CalleeEntryFreq.
+  uint64_t CallSiteFreq = CallerBFI->getBlockFreq(CallBB).getFrequency();
+  uint64_t CalleeEntryFreq = CalleeBFI->getEntryFreq();
+  // Frequency of OrigBB in the callee.
+  BlockFrequency OrigBBFreq = CalleeBFI->getBlockFreq(OrigBB);
+  CallerBFI->setBlockFreq(NewBB, (double)(OrigBBFreq.getFrequency()) /
+                                     CalleeEntryFreq * CallSiteFreq);
+}
+
+/// \brief Update entry count of \p Callee after it got inlined at a callsite
+/// in block \p CallBB.
+void Inliner::updateEntryCount(BasicBlock *CallBB, Function *Callee) {
+  if (!HasProfileData)
+    return;
+  // If the callee has an original count of N, and the estimated count of the
+  // callsite is M, the new callee count is set to N - M. M is estimated from
+  // the caller's entry count, its entry block frequency and the block frequency
+  // of the callsite.
+  Optional<uint64_t> CalleeCount = Callee->getEntryCount();
+  if (!CalleeCount)
+    return;
+  Optional<uint64_t> CallSiteCount = llvm::getBlockCount(CallBB, BFA.get());
+  if (!CallSiteCount)
+    return;
+  // Since CallSiteCount is an estimate, it could exceed the original callee
+  // count and has to be set to 0.
+  if (CallSiteCount.getValue() > CalleeCount.getValue()) {
+    Callee->setEntryCount(0);
+    DEBUG(llvm::dbgs() << "Estimated count of block " << CallBB->getName()
+                       << " is " << CallSiteCount.getValue()
+                       << " which exceeds the entry count "
+                       << CalleeCount.getValue() << " of the callee "
+                       << Callee->getName() << "\n");
+  } else
+    Callee->setEntryCount(CalleeCount.getValue() - CallSiteCount.getValue());
+}
+
+void Inliner::invalidateBFI(Function *F) {
+  if (!HasProfileData)
+    return;
+  if (F)
+    BFA->invalidateBlockFrequencyInfo(F);
+}
+void Inliner::invalidateBFI(CallGraphSCC &SCC) {
+  if (!HasProfileData)
+    return;
+  for (CallGraphNode *Node : SCC) {
+    Function *F = Node->getFunction();
+    invalidateBFI(F);
+  }
+}
+void Inliner::copyBlockFrequency(BasicBlock *Src, BasicBlock *Dst) {
+  if (!HasProfileData)
+    return;
+  Function *F = Src->getParent();
+  BlockFrequencyInfo *BFI = BFA->getBlockFrequencyInfo(F);
+  BFI->setBlockFreq(Dst, BFI->getBlockFreq(Src).getFrequency());
+}
+
+static bool hasProfileData(Module &M) {
+  // We check for the presence of MaxFunctionCount in the module.
+  // FIXME: This now only works for frontend based instrumentation.
+  return M.getMaximumFunctionCount().hasValue();
+}
+
 bool Inliner::runOnSCC(CallGraphSCC &SCC) {
+  using namespace std::placeholders;
   CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+  HasProfileData = hasProfileData(CG.getModule());
   ACT = &getAnalysis<AssumptionCacheTracker>();
   auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 
@@ -419,7 +505,6 @@
 
     InlinedArrayAllocasTy InlinedArrayAllocas;
-    InlineFunctionInfo InlineInfo(&CG, ACT);
 
     // Now that we have all of the call sites, loop over them and inline them if
     // it looks profitable to do so.
@@ -448,6 +533,10 @@
         CS.getInstruction()->eraseFromParent();
         ++NumCallsDeleted;
       } else {
+        Instruction *TheCall = CS.getInstruction();
+        BasicBlock *CallSiteBlock = TheCall->getParent();
+        Instruction *CallSuccessor = &*(++BasicBlock::iterator(TheCall));
+
        // We can only inline direct calls to non-declarations.
        if (!Callee || Callee->isDeclaration()) continue;
 
@@ -476,6 +565,11 @@
          continue;
        }
 
+        BlockCloningFunctor BCF = nullptr;
+        if (HasProfileData)
+          BCF = std::bind(&Inliner::updateBlockFreq, this, CS, _1, _2);
+        InlineFunctionInfo InlineInfo(&CG, ACT, BCF);
+
        // Attempt to inline the function.
        if (!InlineCallIfPossible(*this, CS, InlineInfo, InlinedArrayAllocas,
                                  InlineHistoryID, InsertLifetime)) {
@@ -485,6 +579,13 @@
                                       Caller->getName()));
          continue;
        }
+        updateEntryCount(CallSiteBlock, Callee);
+        // The instruction following the call is part of a new basic block
+        // created during the inlining process. This does not have an entry in
+        // the BFI. We create an entry by copying the frequency of the original
+        // block containing the call.
+        copyBlockFrequency(CallSiteBlock, CallSuccessor->getParent());
+
        ++NumInlined;
 
        // Report the inline decision.
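The two formulas used by the hooks above can be checked with plain arithmetic. The numbers below follow the function-count-update.ll test added at the end of this patch; the raw frequency values (8 and 16) are invented here purely to give the 0.5 ratio that the test comment describes, and the snippet is a standalone illustration rather than code from the patch.

#include <cassert>
#include <cstdint>

int main() {
  // getBlockCount: callsite count = caller entry count * Freq(CallBB) / EntryFreq.
  uint64_t CallerEntryCount = 400;               // caller !prof entry count
  uint64_t CallBBFreq = 8, CallerEntryFreq = 16; // relative frequency 0.5
  uint64_t CallSiteCount = CallerEntryCount * CallBBFreq / CallerEntryFreq;
  assert(CallSiteCount == 200);

  // updateEntryCount: the callee keeps N - M, clamped at zero when the
  // estimate exceeds the original count.
  uint64_t CalleeEntryCount = 1000;
  uint64_t NewCalleeCount =
      CallSiteCount > CalleeEntryCount ? 0 : CalleeEntryCount - CallSiteCount;
  assert(NewCalleeCount == 800); // the value function-count-update.ll checks

  // updateBlockFreq: a cloned block gets
  // Freq(OrigBB) * CallSiteFreq / CalleeEntryFreq in the caller's BFI.
  uint64_t OrigBBFreq = 4, CalleeEntryFreq = 16;
  uint64_t NewBBFreq = OrigBBFreq * CallBBFreq / CalleeEntryFreq;
  assert(NewBBFreq == 2);
  return 0;
}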
@@ -523,7 +624,9 @@
         CalleeNode->removeAllCalledFunctions();
 
         // Removing the node for callee from the call graph and delete it.
-        delete CG.removeFunctionFromModule(CalleeNode);
+        Function *F = CG.removeFunctionFromModule(CalleeNode);
+        invalidateBFI(F);
+        delete F;
         ++NumDeleted;
       }
 
@@ -544,6 +647,7 @@
     }
   } while (LocalChange);
 
+  invalidateBFI(SCC);
   return Changed;
 }
 
@@ -651,7 +755,9 @@
                                                       FunctionsToRemove.end()),
                           FunctionsToRemove.end());
   for (CallGraphNode *CGN : FunctionsToRemove) {
-    delete CG.removeFunctionFromModule(CGN);
+    Function *F = CG.removeFunctionFromModule(CGN);
+    invalidateBFI(F);
+    delete F;
     ++NumDeleted;
   }
   return true;
Index: llvm/trunk/lib/Transforms/Utils/CloneFunction.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Utils/CloneFunction.cpp
+++ llvm/trunk/lib/Transforms/Utils/CloneFunction.cpp
@@ -277,9 +277,10 @@
 
     /// The specified block is found to be reachable, clone it and
     /// anything that it can reach.
-    void CloneBlock(const BasicBlock *BB, 
+    void CloneBlock(const BasicBlock *BB,
                     BasicBlock::const_iterator StartingInst,
-                    std::vector<const BasicBlock*> &ToClone);
+                    std::vector<const BasicBlock*> &ToClone,
+                    BlockCloningFunctor Ftor = nullptr);
   };
 }
 
@@ -287,7 +288,8 @@
 /// anything that it can reach.
 void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
                                        BasicBlock::const_iterator StartingInst,
-                                       std::vector<const BasicBlock*> &ToClone){
+                                       std::vector<const BasicBlock*> &ToClone,
+                                       BlockCloningFunctor Ftor) {
   WeakVH &BBEntry = VMap[BB];
 
   // Have we already cloned this block?
@@ -424,18 +426,19 @@
     CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
       BB != &BB->getParent()->front();
   }
+  // Invoke Ftor to signal that BB has been cloned to NewBB.
+  if (Ftor)
+    Ftor(BB, NewBB);
 }
 
 /// This works like CloneAndPruneFunctionInto, except that it does not clone the
 /// entire function. Instead it starts at an instruction provided by the caller
 /// and copies (and prunes) only the code reachable from that instruction.
-void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
-                                     const Instruction *StartingInst,
-                                     ValueToValueMapTy &VMap,
-                                     bool ModuleLevelChanges,
-                                     SmallVectorImpl<ReturnInst *> &Returns,
-                                     const char *NameSuffix,
-                                     ClonedCodeInfo *CodeInfo) {
+void llvm::CloneAndPruneIntoFromInst(
+    Function *NewFunc, const Function *OldFunc, const Instruction *StartingInst,
+    ValueToValueMapTy &VMap, bool ModuleLevelChanges,
+    SmallVectorImpl<ReturnInst *> &Returns, const char *NameSuffix,
+    ClonedCodeInfo *CodeInfo, BlockCloningFunctor Ftor) {
   assert(NameSuffix && "NameSuffix cannot be null!");
 
   ValueMapTypeRemapper *TypeMapper = nullptr;
@@ -461,11 +464,11 @@
 
   // Clone the entry block, and anything recursively reachable from it.
   std::vector<const BasicBlock*> CloneWorklist;
-  PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist);
+  PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist, Ftor);
   while (!CloneWorklist.empty()) {
     const BasicBlock *BB = CloneWorklist.back();
     CloneWorklist.pop_back();
-    PFC.CloneBlock(BB, BB->begin(), CloneWorklist);
+    PFC.CloneBlock(BB, BB->begin(), CloneWorklist, Ftor);
   }
 
   // Loop over all of the basic blocks in the old function. If the block was
@@ -667,15 +670,14 @@
 /// constant arguments cause a significant amount of code in the callee to be
 /// dead. Since this doesn't produce an exact copy of the input, it can't be
 /// used for things like CloneFunction or CloneModule.
-void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
-                                     ValueToValueMapTy &VMap,
-                                     bool ModuleLevelChanges,
-                                     SmallVectorImpl<ReturnInst *> &Returns,
-                                     const char *NameSuffix,
-                                     ClonedCodeInfo *CodeInfo,
-                                     Instruction *TheCall) {
+void llvm::CloneAndPruneFunctionInto(
+    Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap,
+    bool ModuleLevelChanges, SmallVectorImpl<ReturnInst *> &Returns,
+    const char *NameSuffix, ClonedCodeInfo *CodeInfo, Instruction *TheCall,
+    BlockCloningFunctor Ftor) {
   CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap,
-                            ModuleLevelChanges, Returns, NameSuffix, CodeInfo);
+                            ModuleLevelChanges, Returns, NameSuffix, CodeInfo,
+                            Ftor);
 }
 
 /// \brief Remaps instructions in \p Blocks using the mapping in \p VMap.
Index: llvm/trunk/lib/Transforms/Utils/InlineFunction.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Utils/InlineFunction.cpp
+++ llvm/trunk/lib/Transforms/Utils/InlineFunction.cpp
@@ -1319,7 +1319,7 @@
 
   // If IFI has any state in it, zap it before we fill it in.
   IFI.reset();
-  
+
   const Function *CalledFunc = CS.getCalledFunction();
   if (!CalledFunc ||               // Can't inline external function or indirect
       CalledFunc->isDeclaration() || // call, or call to a vararg function!
@@ -1486,7 +1486,7 @@
     // happy with whatever the cloner can do.
     CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
                               /*ModuleLevelChanges=*/false, Returns, ".i",
-                              &InlinedFunctionInfo, TheCall);
+                              &InlinedFunctionInfo, TheCall, IFI.Ftor);
 
     // Remember the first block that is newly cloned over.
     FirstNewBlock = LastBlock; ++FirstNewBlock;
Index: llvm/trunk/test/Transforms/Inline/function-count-update-2.ll
===================================================================
--- llvm/trunk/test/Transforms/Inline/function-count-update-2.ll
+++ llvm/trunk/test/Transforms/Inline/function-count-update-2.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+
+; This tests that the function count of a callee gets correctly updated after it
+; has been inlined into two callsites.
+
+; CHECK: @callee() !prof [[COUNT:![0-9]+]]
+define i32 @callee() !prof !1 {
+  ret i32 0
+}
+
+define i32 @caller1() !prof !2 {
+  %i = call i32 @callee()
+  ret i32 %i
+}
+
+define i32 @caller2() !prof !3 {
+  %i = call i32 @callee()
+  ret i32 %i
+}
+
+!llvm.module.flags = !{!0}
+; CHECK: [[COUNT]] = !{!"function_entry_count", i64 0}
+!0 = !{i32 1, !"MaxFunctionCount", i32 1000}
+!1 = !{!"function_entry_count", i64 1000}
+!2 = !{!"function_entry_count", i64 600}
+!3 = !{!"function_entry_count", i64 400}
+
Index: llvm/trunk/test/Transforms/Inline/function-count-update-3.ll
===================================================================
--- llvm/trunk/test/Transforms/Inline/function-count-update-3.ll
+++ llvm/trunk/test/Transforms/Inline/function-count-update-3.ll
@@ -0,0 +1,69 @@
+; RUN: opt < %s -inline -S -inline-threshold=50 | FileCheck %s
+
+; This tests that the function count of a function gets properly scaled after
+; inlining a call chain leading to the function.
+; Function a calls c with count 200 (C1)
+; Function b calls c with count 300
+; Function c calls e with count 250 (C2)
+; Entry count of e is 500 (C3)
+; c->e inlining does not happen since the cost exceeds threshold.
+; c is then inlined into a.
+; e now gets inlined into a (through c) since the branch condition in e is now
+; known and hence the cost gets reduced.
+; Estimated count of a->e callsite = C2 * (C1 / C3)
+; Estimated count of a->e callsite = 250 * (200 / 500) = 100
+; Remaining count of e = C3 - 100 = 500 - 100 = 400
+
+@data = external global i32
+
+define i32 @a(i32 %a1) !prof !1 {
+  %a2 = call i32 @c(i32 %a1, i32 1)
+  ret i32 %a2
+}
+
+define i32 @b(i32 %b1) !prof !2 {
+  %b2 = call i32 @c(i32 %b1, i32 %b1)
+  ret i32 %b2
+}
+
+define i32 @c(i32 %c1, i32 %c100) !prof !3 {
+  %cond = icmp sle i32 %c1, 1
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_false:
+  ret i32 0
+
+cond_true:
+  %c11 = call i32 @e(i32 %c100)
+  ret i32 %c11
+}
+
+; CHECK: @e(i32 %c1) !prof [[COUNT:![0-9]+]]
+define i32 @e(i32 %c1) !prof !4 {
+  %cond = icmp sle i32 %c1, 1
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_false:
+  %c2 = load i32, i32* @data, align 4
+  %c3 = add i32 %c1, %c2
+  %c4 = mul i32 %c3, %c2
+  %c5 = add i32 %c4, %c2
+  %c6 = mul i32 %c5, %c2
+  %c7 = add i32 %c6, %c2
+  %c8 = mul i32 %c7, %c2
+  %c9 = add i32 %c8, %c2
+  %c10 = mul i32 %c9, %c2
+  ret i32 %c10
+
+cond_true:
+  ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+; CHECK: [[COUNT]] = !{!"function_entry_count", i64 400}
+!0 = !{i32 1, !"MaxFunctionCount", i32 5000}
+!1 = !{!"function_entry_count", i64 200}
+!2 = !{!"function_entry_count", i64 300}
+!3 = !{!"function_entry_count", i64 500}
+!4 = !{!"function_entry_count", i64 500}
+
Index: llvm/trunk/test/Transforms/Inline/function-count-update.ll
===================================================================
--- llvm/trunk/test/Transforms/Inline/function-count-update.ll
+++ llvm/trunk/test/Transforms/Inline/function-count-update.ll
@@ -0,0 +1,51 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+; RUN: opt < %s -always-inline -S | FileCheck %s
+
+; This tests that the function counts of two callees get correctly updated after
+; they have been inlined into two back-to-back callsites in a single basic block
+; in the caller. The callees have the alwaysinline attribute and so they get
+; inlined both with the regular inliner pass and the always inline pass. In
+; both cases, the new count of each callee is the original count minus callsite
+; count which is 200 (since the caller's entry count is 400 and the block
+; containing the calls has a relative block frequency of 0.5).
+
+; CHECK: @callee1(i32 %n) #0 !prof [[COUNT1:![0-9]+]]
+define i32 @callee1(i32 %n) #0 !prof !1 {
+  %cond = icmp sle i32 %n, 10
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_true:
+  %r1 = add i32 %n, 1
+  ret i32 %r1
+cond_false:
+  %r2 = add i32 %n, 2
+  ret i32 %r2
+}
+
+; CHECK: @callee2(i32 %n) #0 !prof [[COUNT2:![0-9]+]]
+define i32 @callee2(i32 %n) #0 !prof !2 {
+  %r1 = add i32 %n, 1
+  ret i32 %r1
+}
+
+define i32 @caller(i32 %n) !prof !3 {
+  %cond = icmp sle i32 %n, 100
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_true:
+  %i = call i32 @callee1(i32 %n)
+  %j = call i32 @callee2(i32 %i)
+  ret i32 %j
+cond_false:
+  ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+; CHECK: [[COUNT1]] = !{!"function_entry_count", i64 800}
+; CHECK: [[COUNT2]] = !{!"function_entry_count", i64 1800}
+!0 = !{i32 1, !"MaxFunctionCount", i32 1000}
+!1 = !{!"function_entry_count", i64 1000}
+!2 = !{!"function_entry_count", i64 2000}
+!3 = !{!"function_entry_count", i64 400}
+attributes #0 = { alwaysinline }
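
As a cross-check of the expected values in function-count-update-3.ll, the chained scaling can be reproduced with plain integer arithmetic. This is an illustrative standalone snippet, not part of the patch; the only inputs are the C1, C2, and C3 values stated in the test comments.

#include <cassert>
#include <cstdint>

int main() {
  // From the test comments: C1 = count of the a->c callsite, C2 = count of the
  // c->e callsite, C3 = entry count of e.
  uint64_t C1 = 200, C2 = 250, C3 = 500;
  // Once c is inlined into a, the cloned c->e callsite in a is estimated at
  // C2 * C1 / C3, and e keeps whatever remains of its entry count.
  uint64_t InlinedCallSiteCount = C2 * C1 / C3;                // 100
  uint64_t RemainingEntryCountOfE = C3 - InlinedCallSiteCount; // 400
  assert(InlinedCallSiteCount == 100);
  assert(RemainingEntryCountOfE == 400); // matches the CHECK'd !prof value
  return 0;
}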