Index: include/llvm/Analysis/InlineCost.h =================================================================== --- include/llvm/Analysis/InlineCost.h +++ include/llvm/Analysis/InlineCost.h @@ -20,6 +20,7 @@ namespace llvm { class AssumptionCacheTracker; +class BlockFrequencyInfo; class CallSite; class DataLayout; class Function; @@ -38,6 +39,21 @@ const unsigned TotalAllocaSizeRecursiveCaller = 1024; } +/// \brief Block frequency analysis for multiple functions. +/// This class mimics block frequency analysis on CGSCC level. Block frequency +/// info is computed on demand and cached unless they are invalidated. +class BlockFrequencyAnalysis { +private: + DenseMap BFM; + +public: + ~BlockFrequencyAnalysis(); + /// \brief Returns BlockFrequencyInfo for a function. + BlockFrequencyInfo *getBlockFrequencyInfo(Function *); + /// \brief Invalidates block frequency info for a function. + void invalidateBlockFrequencyInfo(Function *); +}; + /// \brief Represents the cost of inlining a function. /// /// This supports special values for functions which should "always" or @@ -111,7 +127,8 @@ /// inlining the callsite. It is an expensive, heavyweight call. InlineCost getInlineCost(CallSite CS, int DefaultThreshold, TargetTransformInfo &CalleeTTI, - AssumptionCacheTracker *ACT); + AssumptionCacheTracker *ACT, + BlockFrequencyAnalysis *BFA); /// \brief Get an InlineCost with the callee explicitly specified. /// This allows you to calculate the cost of inlining a function via a @@ -120,7 +137,8 @@ // InlineCost getInlineCost(CallSite CS, Function *Callee, int DefaultThreshold, TargetTransformInfo &CalleeTTI, - AssumptionCacheTracker *ACT); + AssumptionCacheTracker *ACT, + BlockFrequencyAnalysis *BFA); int computeThresholdFromOptLevels(unsigned OptLevel, unsigned SizeOptLevel); @@ -129,6 +147,9 @@ /// \brief Minimal filter to detect invalid constructs for inlining. bool isInlineViable(Function &Callee); + +/// \brief Return estimated count of the block \p BB. +Optional getBlockCount(BasicBlock *BB, BlockFrequencyAnalysis *BFA); } #endif Index: include/llvm/Transforms/IPO/InlinerPass.h =================================================================== --- include/llvm/Transforms/IPO/InlinerPass.h +++ include/llvm/Transforms/IPO/InlinerPass.h @@ -26,6 +26,15 @@ class InlineCost; template class SmallPtrSet; +// Functor invoked when a block is cloned during inlining. +typedef std::function + BlockCloningFunctor; +// Functor invoked when a function is inlined inside the basic block +// containing the call. +typedef std::function FunctionCloningFunctor; +// Functor invoked when a function gets deleted during inlining. +typedef std::function FunctionDeletedFunctor; + /// Inliner - This class contains all of the helper code which is used to /// perform the inlining operations that do not depend on the policy. /// @@ -72,6 +81,13 @@ protected: AssumptionCacheTracker *ACT; + // The getXXXFunctor methods return a nullptr. Classes that inherit + // Inliner override them to update analysis information. + virtual BlockCloningFunctor getBlockCloningFunctor(CallSite &CS) { + return nullptr; + } + virtual FunctionCloningFunctor getFunctionCloningFunctor() { return nullptr; } + virtual FunctionDeletedFunctor getFunctionDeletedFunctor() { return nullptr; } }; } // End llvm namespace Index: include/llvm/Transforms/Utils/Cloning.h =================================================================== --- include/llvm/Transforms/Utils/Cloning.h +++ include/llvm/Transforms/Utils/Cloning.h @@ -48,6 +48,9 @@ class AssumptionCacheTracker; class DominatorTree; +typedef std::function + BlockCloningFunctor; + /// Return an exact copy of the specified module /// std::unique_ptr CloneModule(const Module *M); @@ -152,7 +155,8 @@ ValueToValueMapTy &VMap, bool ModuleLevelChanges, SmallVectorImpl &Returns, const char *NameSuffix = "", - ClonedCodeInfo *CodeInfo = nullptr); + ClonedCodeInfo *CodeInfo = nullptr, + BlockCloningFunctor Ftor = nullptr); /// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto, /// except that it does some simple constant prop and DCE on the fly. The @@ -167,23 +171,27 @@ /// void CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap, bool ModuleLevelChanges, - SmallVectorImpl &Returns, + SmallVectorImpl &Returns, const char *NameSuffix = "", ClonedCodeInfo *CodeInfo = nullptr, - Instruction *TheCall = nullptr); + Instruction *TheCall = nullptr, + BlockCloningFunctor Ftor = nullptr); /// InlineFunctionInfo - This class captures the data input to the /// InlineFunction call, and records the auxiliary results produced by it. class InlineFunctionInfo { public: explicit InlineFunctionInfo(CallGraph *cg = nullptr, - AssumptionCacheTracker *ACT = nullptr) - : CG(cg), ACT(ACT) {} + AssumptionCacheTracker *ACT = nullptr, + BlockCloningFunctor Ftor = nullptr) + : CG(cg), ACT(ACT), Ftor(Ftor) {} /// CG - If non-null, InlineFunction will update the callgraph to reflect the /// changes it makes. CallGraph *CG; AssumptionCacheTracker *ACT; + // Functor that is invoked when a block is cloned into the new function. + BlockCloningFunctor Ftor; /// StaticAllocas - InlineFunction fills this in with all static allocas that /// get copied into the caller. Index: lib/Analysis/InlineCost.cpp =================================================================== --- lib/Analysis/InlineCost.cpp +++ lib/Analysis/InlineCost.cpp @@ -18,13 +18,18 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/InstVisitor.h" @@ -85,6 +90,7 @@ // easily cacheable. Instead, use the cover function paramHasAttr. CallSite CandidateCS; + BlockFrequencyAnalysis *BFA; int Threshold; int Cost; @@ -153,6 +159,8 @@ /// passed to support analyzing indirect calls whose target is inferred by /// analysis. void updateThreshold(CallSite CS, Function &Callee); + /// Adjust Threshold based on CallSiteCount and return the adjusted threshold. + int getAdjustedThreshold(int Threshold, Optional CallSiteCount); // Custom analysis routines. bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl &EphValues); @@ -194,17 +202,19 @@ public: CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT, - Function &Callee, int Threshold, CallSite CSArg) - : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold), - Cost(0), IsCallerRecursive(false), IsRecursiveCall(false), - ExposesReturnsTwice(false), HasDynamicAlloca(false), - ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false), - HasFrameEscape(false), AllocatedSize(0), NumInstructions(0), - NumVectorInstructions(0), FiftyPercentVectorBonus(0), - TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0), - NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0), - NumConstantPtrDiffs(0), NumInstructionsSimplified(0), - SROACostSavings(0), SROACostSavingsLost(0) {} + Function &Callee, int Threshold, CallSite CSArg, + BlockFrequencyAnalysis *BFA) + : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), BFA(BFA), + Threshold(Threshold), Cost(0), IsCallerRecursive(false), + IsRecursiveCall(false), ExposesReturnsTwice(false), + HasDynamicAlloca(false), ContainsNoDuplicateCall(false), + HasReturn(false), HasIndirectBr(false), HasFrameEscape(false), + AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0), + FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0), + NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), + NumConstantPtrCmps(0), NumConstantPtrDiffs(0), + NumInstructionsSimplified(0), SROACostSavings(0), + SROACostSavingsLost(0) {} bool analyzeCall(CallSite CS); @@ -572,6 +582,15 @@ return false; } +// Adjust the threshold based on callsite hotness. Currently this is a nop. +int CallAnalyzer::getAdjustedThreshold(int Threshold, + Optional CallSiteCount + __attribute__((unused))) { + // FIXME: The new threshold should be computed from the given Threshold and + // the callsite hotness. + return Threshold; +} + void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) { // If -inline-threshold is not given, listen to the optsize and minsize // attributes when they would decrease the threshold. @@ -596,6 +615,10 @@ FunctionCount = Callee.getEntryCount().getValue(); MaxFunctionCount = Callee.getParent()->getMaximumFunctionCount().getValue(); } + Instruction *Call = CS.getInstruction(); + BasicBlock *CallBB = Call->getParent(); + Optional CallSiteCount = llvm::getBlockCount(CallBB, BFA); + Threshold = getAdjustedThreshold(Threshold, CallSiteCount); // Listen to the inlinehint attribute or profile based hotness information // when it would increase the threshold and the caller does not need to @@ -912,7 +935,8 @@ // during devirtualization and so we want to give it a hefty bonus for // inlining, but cap that bonus in the event that inlining wouldn't pan // out. Pretend to inline the function, with a custom threshold. - CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS); + CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS, + BFA); if (CA.analyzeCall(CS)) { // We were able to inline the indirect call! Subtract the cost from the // threshold to get the bonus we want to apply, but don't go below zero. @@ -1422,9 +1446,10 @@ InlineCost llvm::getInlineCost(CallSite CS, int DefaultThreshold, TargetTransformInfo &CalleeTTI, - AssumptionCacheTracker *ACT) { + AssumptionCacheTracker *ACT, + BlockFrequencyAnalysis *BFA) { return getInlineCost(CS, CS.getCalledFunction(), DefaultThreshold, CalleeTTI, - ACT); + ACT, BFA); } int llvm::computeThresholdFromOptLevels(unsigned OptLevel, @@ -1443,7 +1468,8 @@ InlineCost llvm::getInlineCost(CallSite CS, Function *Callee, int DefaultThreshold, TargetTransformInfo &CalleeTTI, - AssumptionCacheTracker *ACT) { + AssumptionCacheTracker *ACT, + BlockFrequencyAnalysis *BFA) { // Cannot inline indirect calls. if (!Callee) @@ -1476,7 +1502,7 @@ DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "...\n"); - CallAnalyzer CA(CalleeTTI, ACT, *Callee, DefaultThreshold, CS); + CallAnalyzer CA(CalleeTTI, ACT, *Callee, DefaultThreshold, CS, BFA); bool ShouldInline = CA.analyzeCall(CS); DEBUG(CA.dump()); @@ -1524,3 +1550,43 @@ return true; } + +/// \brief Get estimated execution count for \p BB. +Optional llvm::getBlockCount(BasicBlock *BB, + BlockFrequencyAnalysis *BFA) { + Function *F = BB->getParent(); + Optional EntryCount = F->getEntryCount(); + if (!EntryCount) + return None; + BlockFrequencyInfo *BFI = BFA->getBlockFrequencyInfo(F); + uint64_t BBFreq = BFI->getBlockFreq(BB).getFrequency(); + uint64_t FunctionEntryFreq = BFI->getEntryFreq(); + uint64_t BBCount = EntryCount.getValue() * BBFreq / FunctionEntryFreq; + return BBCount; +} + +BlockFrequencyAnalysis::~BlockFrequencyAnalysis() { + for (auto &Entry : BFM) { + delete Entry.second; + } +} + +/// \brief Get BlockFrequencyInfo for a function. +BlockFrequencyInfo *BlockFrequencyAnalysis::getBlockFrequencyInfo(Function *F) { + auto Iter = BFM.find(F); + if (Iter != BFM.end()) + return Iter->second; + // We need to create a BlockFrequencyInfo object for F and store it. + DominatorTree DT; + DT.recalculate(*F); + LoopInfo LI(DT); + BranchProbabilityInfo BPI(*F, LI); + BlockFrequencyInfo *BFI = new BlockFrequencyInfo(*F, BPI, LI); + BFM[F] = BFI; + return BFI; +} + +/// \brief Invalidate BlockFrequencyInfo for a function. +void BlockFrequencyAnalysis::invalidateBlockFrequencyInfo(Function *F) { + BFM.erase(F); +} Index: lib/Transforms/IPO/InlineSimple.cpp =================================================================== --- lib/Transforms/IPO/InlineSimple.cpp +++ lib/Transforms/IPO/InlineSimple.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -23,10 +24,14 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/InlinerPass.h" using namespace llvm; +using namespace std::placeholders; #define DEBUG_TYPE "inline" @@ -43,14 +48,42 @@ // --inline-threshold flag, // user specified value. int DefaultThreshold; + std::unique_ptr BFA; + void InvalidateAnalysis(CallGraphSCC &SCC); + void InvalidateFunctionAnalysis(Function *F); + void UpdateEntryCount(BasicBlock *CallBB, Function *Callee); + void UpdateBlockFreq(CallSite &CS, const BasicBlock *OrigBB, + const BasicBlock *NewBB); + +protected: + BlockCloningFunctor getBlockCloningFunctor(CallSite &CS) override { + BlockCloningFunctor BCF = + std::bind(&SimpleInliner::UpdateBlockFreq, this, CS, _1, _2); + return BCF; + } + + FunctionCloningFunctor getFunctionCloningFunctor() override { + FunctionCloningFunctor FCF = + std::bind(&SimpleInliner::UpdateEntryCount, this, _1, _2); + return FCF; + } + + FunctionDeletedFunctor getFunctionDeletedFunctor() override { + FunctionDeletedFunctor FDF = + std::bind(&SimpleInliner::InvalidateFunctionAnalysis, this, _1); + return FDF; + } public: SimpleInliner() - : Inliner(ID), DefaultThreshold(llvm::getDefaultInlineThreshold()) { + : Inliner(ID), DefaultThreshold(llvm::getDefaultInlineThreshold()), + BFA(new BlockFrequencyAnalysis()) { initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); } - SimpleInliner(int Threshold) : Inliner(ID), DefaultThreshold(Threshold) { + SimpleInliner(int Threshold) + : Inliner(ID), DefaultThreshold(Threshold), + BFA(new BlockFrequencyAnalysis()) { initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); } @@ -59,7 +92,7 @@ InlineCost getInlineCost(CallSite CS) override { Function *Callee = CS.getCalledFunction(); TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); - return llvm::getInlineCost(CS, DefaultThreshold, TTI, ACT); + return llvm::getInlineCost(CS, DefaultThreshold, TTI, ACT, BFA.get()); } bool runOnSCC(CallGraphSCC &SCC) override; @@ -95,10 +128,72 @@ bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) { TTIWP = &getAnalysis(); - return Inliner::runOnSCC(SCC); + bool RetVal = Inliner::runOnSCC(SCC); + InvalidateAnalysis(SCC); + return RetVal; } void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); Inliner::getAnalysisUsage(AU); } + +/// \brief Update the frequency of a block that is cloned into the caller. +/// This is invoked when \p OrigBB from the callee is cloned into \p NewBB in +/// the caller. +void SimpleInliner::UpdateBlockFreq(CallSite &CS, const BasicBlock *OrigBB, + const BasicBlock *NewBB) { + Instruction *Call = CS.getInstruction(); + BasicBlock *CallBB = Call->getParent(); + BlockFrequencyInfo *CalleeBFI = + BFA->getBlockFrequencyInfo(CS.getCalledFunction()); + BlockFrequencyInfo *CallerBFI = + BFA->getBlockFrequencyInfo(CallBB->getParent()); + // Find the number of times OrigBB is executed per invocation of the callee + // and multiply by the number of times callee is executed in the caller. + // Freq(NewBB) = Freq(OrigBB) * CallSiteFreq / CalleeEntryFreq. + uint64_t CallSiteFreq = CallerBFI->getBlockFreq(CallBB).getFrequency(); + uint64_t CalleeEntryFreq = CalleeBFI->getEntryFreq(); + // Frequency of OrigBB in the callee. + BlockFrequency OrigBBFreq = CalleeBFI->getBlockFreq(OrigBB); + CallerBFI->setBlockFreq(NewBB, (double)(OrigBBFreq.getFrequency()) / + CalleeEntryFreq * CallSiteFreq); +} + +/// \brief Update entry count of \p Callee after it got inlined at a callsite +/// in block \p CallBB. +void SimpleInliner::UpdateEntryCount(BasicBlock *CallBB, Function *Callee) { + // If the callee has a original count of N, and the estimated count of + // callsite is M, the new callee count is set to N - M. M is estimated from + // the caller's entry count, its entry block frequency and the block frequency + // of the callsite. + Optional CalleeCount = Callee->getEntryCount(); + if (!CalleeCount) + return; + Optional CallSiteCount = llvm::getBlockCount(CallBB, BFA.get()); + if (!CallSiteCount) + return; + // Since CallSiteCount is an estimate, it could exceed the original callee + // count and has to be set to 0. + if (CallSiteCount.getValue() > CalleeCount.getValue()) { + Callee->setEntryCount(0); + DEBUG(llvm::dbgs() << "Estimated count of block " << CallBB->getName() + << " is " << CallSiteCount.getValue() + << " which exceeds the entry count " + << CalleeCount.getValue() << " of the callee " + << Callee->getName() << "\n"); + } else + Callee->setEntryCount(CalleeCount.getValue() - CallSiteCount.getValue()); +} + +void SimpleInliner::InvalidateAnalysis(CallGraphSCC &SCC) { + for (CallGraphNode *Node : SCC) { + Function *F = Node->getFunction(); + InvalidateFunctionAnalysis(F); + } +} +void SimpleInliner::InvalidateFunctionAnalysis(Function *F) { + if (F) { + BFA->invalidateBlockFrequencyInfo(F); + } +} Index: lib/Transforms/IPO/Inliner.cpp =================================================================== --- lib/Transforms/IPO/Inliner.cpp +++ lib/Transforms/IPO/Inliner.cpp @@ -258,7 +258,7 @@ Twine(IC.getCostDelta() + IC.getCost()) + ")"); return false; } - + // Try to detect the case where the current inlining candidate caller (call // it B) is a static or linkonce-ODR function and is an inlining candidate // elsewhere, and the current candidate callee (call it C) is large enough @@ -418,7 +418,6 @@ InlinedArrayAllocasTy InlinedArrayAllocas; - InlineFunctionInfo InlineInfo(&CG, ACT); // Now that we have all of the call sites, loop over them and inline them if // it looks profitable to do so. @@ -447,6 +446,7 @@ CS.getInstruction()->eraseFromParent(); ++NumCallsDeleted; } else { + BasicBlock *CallSiteBlock = CS.getInstruction()->getParent(); // We can only inline direct calls to non-declarations. if (!Callee || Callee->isDeclaration()) continue; @@ -474,6 +474,7 @@ Caller->getName())); continue; } + InlineFunctionInfo InlineInfo(&CG, ACT, getBlockCloningFunctor(CS)); // Attempt to inline the function. if (!InlineCallIfPossible(*this, CS, InlineInfo, InlinedArrayAllocas, @@ -484,6 +485,10 @@ Caller->getName())); continue; } + FunctionCloningFunctor FCF = getFunctionCloningFunctor(); + if (FCF) + FCF(CallSiteBlock, Callee); + ++NumInlined; // Report the inline decision. @@ -522,7 +527,11 @@ CalleeNode->removeAllCalledFunctions(); // Removing the node for callee from the call graph and delete it. - delete CG.removeFunctionFromModule(CalleeNode); + FunctionDeletedFunctor FDF = getFunctionDeletedFunctor(); + Function *F = CG.removeFunctionFromModule(CalleeNode); + if (FDF) + FDF(F); + delete F; ++NumDeleted; } @@ -649,8 +658,12 @@ FunctionsToRemove.erase(std::unique(FunctionsToRemove.begin(), FunctionsToRemove.end()), FunctionsToRemove.end()); + FunctionDeletedFunctor FDF = getFunctionDeletedFunctor(); for (CallGraphNode *CGN : FunctionsToRemove) { - delete CG.removeFunctionFromModule(CGN); + Function *F = CG.removeFunctionFromModule(CGN); + if (FDF) + FDF(F); + delete F; ++NumDeleted; } return true; Index: lib/Transforms/Utils/CloneFunction.cpp =================================================================== --- lib/Transforms/Utils/CloneFunction.cpp +++ lib/Transforms/Utils/CloneFunction.cpp @@ -277,9 +277,10 @@ /// The specified block is found to be reachable, clone it and /// anything that it can reach. - void CloneBlock(const BasicBlock *BB, + void CloneBlock(const BasicBlock *BB, BasicBlock::const_iterator StartingInst, - std::vector &ToClone); + std::vector &ToClone, + BlockCloningFunctor Ftor = nullptr); }; } @@ -287,7 +288,8 @@ /// anything that it can reach. void PruningFunctionCloner::CloneBlock(const BasicBlock *BB, BasicBlock::const_iterator StartingInst, - std::vector &ToClone){ + std::vector &ToClone, + BlockCloningFunctor Ftor) { WeakVH &BBEntry = VMap[BB]; // Have we already cloned this block? @@ -424,18 +426,19 @@ CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && BB != &BB->getParent()->front(); } + // Call Ftor to tell BB has been cloned to NewBB + if (Ftor) + Ftor(BB, NewBB); } /// This works like CloneAndPruneFunctionInto, except that it does not clone the /// entire function. Instead it starts at an instruction provided by the caller /// and copies (and prunes) only the code reachable from that instruction. -void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, - const Instruction *StartingInst, - ValueToValueMapTy &VMap, - bool ModuleLevelChanges, - SmallVectorImpl &Returns, - const char *NameSuffix, - ClonedCodeInfo *CodeInfo) { +void llvm::CloneAndPruneIntoFromInst( + Function *NewFunc, const Function *OldFunc, const Instruction *StartingInst, + ValueToValueMapTy &VMap, bool ModuleLevelChanges, + SmallVectorImpl &Returns, const char *NameSuffix, + ClonedCodeInfo *CodeInfo, BlockCloningFunctor Ftor) { assert(NameSuffix && "NameSuffix cannot be null!"); ValueMapTypeRemapper *TypeMapper = nullptr; @@ -461,11 +464,11 @@ // Clone the entry block, and anything recursively reachable from it. std::vector CloneWorklist; - PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist); + PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist, Ftor); while (!CloneWorklist.empty()) { const BasicBlock *BB = CloneWorklist.back(); CloneWorklist.pop_back(); - PFC.CloneBlock(BB, BB->begin(), CloneWorklist); + PFC.CloneBlock(BB, BB->begin(), CloneWorklist, Ftor); } // Loop over all of the basic blocks in the old function. If the block was @@ -666,15 +669,14 @@ /// constant arguments cause a significant amount of code in the callee to be /// dead. Since this doesn't produce an exact copy of the input, it can't be /// used for things like CloneFunction or CloneModule. -void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc, - ValueToValueMapTy &VMap, - bool ModuleLevelChanges, - SmallVectorImpl &Returns, - const char *NameSuffix, - ClonedCodeInfo *CodeInfo, - Instruction *TheCall) { +void llvm::CloneAndPruneFunctionInto( + Function *NewFunc, const Function *OldFunc, ValueToValueMapTy &VMap, + bool ModuleLevelChanges, SmallVectorImpl &Returns, + const char *NameSuffix, ClonedCodeInfo *CodeInfo, Instruction *TheCall, + BlockCloningFunctor Ftor) { CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap, - ModuleLevelChanges, Returns, NameSuffix, CodeInfo); + ModuleLevelChanges, Returns, NameSuffix, CodeInfo, + Ftor); } /// \brief Remaps instructions in \p Blocks using the mapping in \p VMap. Index: lib/Transforms/Utils/InlineFunction.cpp =================================================================== --- lib/Transforms/Utils/InlineFunction.cpp +++ lib/Transforms/Utils/InlineFunction.cpp @@ -1319,7 +1319,7 @@ // If IFI has any state in it, zap it before we fill it in. IFI.reset(); - + const Function *CalledFunc = CS.getCalledFunction(); if (!CalledFunc || // Can't inline external function or indirect CalledFunc->isDeclaration() || // call, or call to a vararg function! @@ -1473,7 +1473,7 @@ // happy with whatever the cloner can do. CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, /*ModuleLevelChanges=*/false, Returns, ".i", - &InlinedFunctionInfo, TheCall); + &InlinedFunctionInfo, TheCall, IFI.Ftor); // Remember the first block that is newly cloned over. FirstNewBlock = LastBlock; ++FirstNewBlock; Index: test/Transforms/Inline/function-count-update-2.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/function-count-update-2.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -inline -S | FileCheck %s + +; This tests that the function count of a callee gets correctly updated after it +; has been inlined into a two callsites. + +; CHECK: @callee() !prof [[COUNT:![0-9]+]] +define i32 @callee() !prof !1 { + ret i32 0 +} + +define i32 @caller1() !prof !2 { + %i = call i32 @callee() + ret i32 %i +} + +define i32 @caller2() !prof !3 { + %i = call i32 @callee() + ret i32 %i +} + +!llvm.module.flags = !{!0} +; CHECK: [[COUNT]] = !{!"function_entry_count", i64 0} +!0 = !{i32 1, !"MaxFunctionCount", i32 1000} +!1 = !{!"function_entry_count", i64 1000} +!2 = !{!"function_entry_count", i64 600} +!3 = !{!"function_entry_count", i64 400} + Index: test/Transforms/Inline/function-count-update-3.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/function-count-update-3.ll @@ -0,0 +1,69 @@ +; RUN: opt < %s -inline -S -inline-threshold=40 | FileCheck %s + +; This tests that the function count of a function gets properly scaled after +; inlining a call chain leading to the function. +; Function a calls c with count 200 (C1) +; Function b calls c with count 300 +; Function c calls e with count 250 (C2) +; Entry count of e is 500 (C3) +; c->e inlining does not happen since the cost exceeds threshold. +; c then inlined into a. +; e now gets inlined into a (through c) since the branch condition in e is now +; known and hence the cost gets reduced. +; Estimated count of a->e callsite = C2 * (C1 / C3) +; Estimated count of a->e callsite = 250 * (200 / 500) = 100 +; Remaining count of e = C3 - 100 = 500 - 100 = 400 + +@data = external global i32 + +define i32 @a(i32 %a1) !prof !1 { + %a2 = call i32 @c(i32 %a1, i32 1) + ret i32 %a2 +} + +define i32 @b(i32 %b1) !prof !2 { + %b2 = call i32 @c(i32 %b1, i32 %b1) + ret i32 %b2 +} + +define i32 @c(i32 %c1, i32 %c100) !prof !3 { + %cond = icmp sle i32 %c1, 1 + br i1 %cond, label %cond_true, label %cond_false + +cond_false: + ret i32 0 + +cond_true: + %c11 = call i32 @e(i32 %c100) + ret i32 %c11 +} + +; CHECK: @e(i32 %c1) !prof [[COUNT:![0-9]+]] +define i32 @e(i32 %c1) !prof !4 { + %cond = icmp sle i32 %c1, 1 + br i1 %cond, label %cond_true, label %cond_false + +cond_false: + %c2 = load i32, i32* @data, align 4 + %c3 = add i32 %c1, %c2 + %c4 = mul i32 %c3, %c2 + %c5 = add i32 %c4, %c2 + %c6 = mul i32 %c5, %c2 + %c7 = add i32 %c6, %c2 + %c8 = mul i32 %c7, %c2 + %c9 = add i32 %c8, %c2 + %c10 = mul i32 %c9, %c2 + ret i32 %c10 + +cond_true: + ret i32 0 +} + +!llvm.module.flags = !{!0} +; CHECK: [[COUNT]] = !{!"function_entry_count", i64 400} +!0 = !{i32 1, !"MaxFunctionCount", i32 5000} +!1 = !{!"function_entry_count", i64 200} +!2 = !{!"function_entry_count", i64 300} +!3 = !{!"function_entry_count", i64 500} +!4 = !{!"function_entry_count", i64 500} +