Index: include/llvm/Analysis/BlockFrequencyInfo.h =================================================================== --- include/llvm/Analysis/BlockFrequencyInfo.h +++ include/llvm/Analysis/BlockFrequencyInfo.h @@ -69,6 +69,11 @@ // Set the frequency of the given basic block. void setBlockFreq(const BasicBlock *BB, uint64_t Freq); + /// Set the frequency of \p BB and scale the frequencies of \p BlocksToScale + /// such that their frequencies relative to \p BB remain the same. + void setBlockFreqAndScale(const BasicBlock *BB, uint64_t Freq, + SmallVectorImpl &BlocksToScale); + /// calculate - compute block frequency info for the given function. void calculate(const Function &F, const BranchProbabilityInfo &BPI, const LoopInfo &LI); Index: include/llvm/Analysis/InlineCost.h =================================================================== --- include/llvm/Analysis/InlineCost.h +++ include/llvm/Analysis/InlineCost.h @@ -21,6 +21,7 @@ namespace llvm { class AssumptionCacheTracker; +class BlockFrequencyInfo; class CallSite; class DataLayout; class Function; @@ -137,6 +138,9 @@ /// Threshold to use when the callsite is considered hot. Optional HotCallSiteThreshold; + + /// Threshold to use when the callsite is considered cold. + Optional ColdCallSiteThreshold; }; /// Generate the parameters to tune the inline cost analysis based only on the @@ -171,6 +175,7 @@ getInlineCost(CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI, std::function &GetAssumptionCache, + Optional> GetBFI, ProfileSummaryInfo *PSI); /// \brief Get an InlineCost with the callee explicitly specified. @@ -182,6 +187,7 @@ getInlineCost(CallSite CS, Function *Callee, const InlineParams &Params, TargetTransformInfo &CalleeTTI, std::function &GetAssumptionCache, + Optional> GetBFI, ProfileSummaryInfo *PSI); /// \brief Minimal filter to detect invalid constructs for inlining. Index: include/llvm/Analysis/ProfileSummaryInfo.h =================================================================== --- include/llvm/Analysis/ProfileSummaryInfo.h +++ include/llvm/Analysis/ProfileSummaryInfo.h @@ -29,6 +29,7 @@ namespace llvm { class BasicBlock; class BlockFrequencyInfo; +class CallSite; class ProfileSummary; /// \brief Analysis providing profile information. /// @@ -63,6 +64,12 @@ bool isColdCount(uint64_t C); /// \brief Returns true if BasicBlock \p B is considered hot. bool isHotBB(const BasicBlock *B, BlockFrequencyInfo *BFI); + /// \brief Returns true if BasicBlock \p B is considered cold. + bool isColdBB(const BasicBlock *B, BlockFrequencyInfo *BFI); + /// \brief Returns true if CallSite \p CS is considered hot. + bool isHotCallSite(const CallSite &CS, BlockFrequencyInfo *BFI); + /// \brief Returns true if Callsite \p CS is considered cold. + bool isColdCallSite(const CallSite &CS, BlockFrequencyInfo *BFI); }; /// An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo. Index: include/llvm/Transforms/Utils/Cloning.h =================================================================== --- include/llvm/Transforms/Utils/Cloning.h +++ include/llvm/Transforms/Utils/Cloning.h @@ -47,6 +47,7 @@ class LoopInfo; class AllocaInst; class AssumptionCacheTracker; +class BlockFrequencyInfo; class DominatorTree; /// Return an exact copy of the specified module @@ -178,13 +179,17 @@ public: explicit InlineFunctionInfo(CallGraph *cg = nullptr, std::function - *GetAssumptionCache = nullptr) - : CG(cg), GetAssumptionCache(GetAssumptionCache) {} + *GetAssumptionCache = nullptr, + BlockFrequencyInfo *CallerBFI = nullptr, + BlockFrequencyInfo *CalleeBFI = nullptr) + : CG(cg), GetAssumptionCache(GetAssumptionCache), CallerBFI(CallerBFI), + CalleeBFI(CalleeBFI) {} /// CG - If non-null, InlineFunction will update the callgraph to reflect the /// changes it makes. CallGraph *CG; std::function *GetAssumptionCache; + BlockFrequencyInfo *CallerBFI, *CalleeBFI; /// StaticAllocas - InlineFunction fills this in with all static allocas that /// get copied into the caller. Index: lib/Analysis/BlockFrequencyInfo.cpp =================================================================== --- lib/Analysis/BlockFrequencyInfo.cpp +++ lib/Analysis/BlockFrequencyInfo.cpp @@ -171,6 +171,26 @@ BFI->setBlockFreq(BB, Freq); } +void BlockFrequencyInfo::setBlockFreqAndScale( + const BasicBlock *AnchorBB, uint64_t Freq, + SmallVectorImpl &BlocksToScale) { + assert(BFI && "Expected analysis to be available"); + // Use 128 bits APInt to avoid overflow. + APInt NewFreq(128, Freq); + APInt OldFreq(128, BFI->getBlockFreq(AnchorBB).getFrequency()); + for (auto *BB : BlocksToScale) { + + APInt BBFreq(128, BFI->getBlockFreq(BB).getFrequency()); + + // Multiply first by NewFreq and then divide by OldFreq + // to minimize loss of precision. + BBFreq *= NewFreq; + BBFreq = BBFreq.udiv(OldFreq); + BFI->setBlockFreq(BB, BBFreq.getLimitedValue()); + } + BFI->setBlockFreq(AnchorBB, Freq); +} + /// Pop up a ghostview window with the current block frequency propagation /// rendered using dot. void BlockFrequencyInfo::view() const { Index: lib/Analysis/InlineCost.cpp =================================================================== --- lib/Analysis/InlineCost.cpp +++ lib/Analysis/InlineCost.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -48,6 +49,11 @@ "inlinehint-threshold", cl::Hidden, cl::init(325), cl::desc("Threshold for inlining functions with inline hint")); +static cl::opt + ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden, + cl::init(45), + cl::desc("Threshold for inlining cold callsites")); + // We introduce this threshold to help performance of instrumentation based // PGO before we actually hook up inliner with analysis passes such as BPI and // BFI. @@ -72,6 +78,9 @@ /// Getter for the cache of @llvm.assume intrinsics. std::function &GetAssumptionCache; + /// Getter for BlockFrequencyInfo + Optional> &GetBFI; + /// Profile summary information. ProfileSummaryInfo *PSI; @@ -85,7 +94,6 @@ /// Tunable parameters that control the analysis. const InlineParams &Params; - int Threshold; int Cost; @@ -202,19 +210,21 @@ public: CallAnalyzer(const TargetTransformInfo &TTI, std::function &GetAssumptionCache, + Optional> &GetBFI, ProfileSummaryInfo *PSI, Function &Callee, CallSite CSArg, const InlineParams &Params) - : TTI(TTI), GetAssumptionCache(GetAssumptionCache), PSI(PSI), F(Callee), - CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold), - Cost(0), IsCallerRecursive(false), IsRecursiveCall(false), - ExposesReturnsTwice(false), HasDynamicAlloca(false), - ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false), - HasFrameEscape(false), AllocatedSize(0), NumInstructions(0), - NumVectorInstructions(0), FiftyPercentVectorBonus(0), - TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0), - NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0), - NumConstantPtrDiffs(0), NumInstructionsSimplified(0), - SROACostSavings(0), SROACostSavingsLost(0) {} + : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI), + PSI(PSI), F(Callee), CandidateCS(CSArg), Params(Params), + Threshold(Params.DefaultThreshold), Cost(0), IsCallerRecursive(false), + IsRecursiveCall(false), ExposesReturnsTwice(false), + HasDynamicAlloca(false), ContainsNoDuplicateCall(false), + HasReturn(false), HasIndirectBr(false), HasFrameEscape(false), + AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0), + FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0), + NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), + NumConstantPtrCmps(0), NumConstantPtrDiffs(0), + NumInstructionsSimplified(0), SROACostSavings(0), + SROACostSavingsLost(0) {} bool analyzeCall(CallSite CS); @@ -642,16 +652,21 @@ if (Callee.hasFnAttribute(Attribute::InlineHint)) Threshold = MaxIfValid(Threshold, Params.HintThreshold); if (PSI) { - uint64_t TotalWeight; - if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) && - PSI->isHotCount(TotalWeight)) { + BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr; + if (PSI->isHotCallSite(CS, CallerBFI)) { + DEBUG(llvm::dbgs() << "Hot callsite.\n"); Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold); } else if (PSI->isFunctionEntryHot(&Callee)) { + DEBUG(llvm::dbgs() << "Hot callee.\n"); // If callsite hotness can not be determined, we may still know // that the callee is hot and treat it as a weaker hint for threshold // increase. Threshold = MaxIfValid(Threshold, Params.HintThreshold); + } else if (PSI->isColdCallSite(CS, CallerBFI)) { + DEBUG(llvm::dbgs() << "Cold callsite.\n"); + Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold); } else if (PSI->isFunctionEntryCold(&Callee)) { + DEBUG(llvm::dbgs() << "Cold callee.\n"); Threshold = MinIfValid(Threshold, Params.ColdThreshold); } } @@ -959,7 +974,8 @@ // out. Pretend to inline the function, with a custom threshold. auto IndirectCallParams = Params; IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold; - CallAnalyzer CA(TTI, GetAssumptionCache, PSI, *F, CS, IndirectCallParams); + CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, *F, CS, + IndirectCallParams); if (CA.analyzeCall(CS)) { // We were able to inline the indirect call! Subtract the cost from the // threshold to get the bonus we want to apply, but don't go below zero. @@ -1449,15 +1465,17 @@ InlineCost llvm::getInlineCost( CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI, std::function &GetAssumptionCache, + Optional> GetBFI, ProfileSummaryInfo *PSI) { return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI, - GetAssumptionCache, PSI); + GetAssumptionCache, GetBFI, PSI); } InlineCost llvm::getInlineCost( CallSite CS, Function *Callee, const InlineParams &Params, TargetTransformInfo &CalleeTTI, std::function &GetAssumptionCache, + Optional> GetBFI, ProfileSummaryInfo *PSI) { // Cannot inline indirect calls. @@ -1492,7 +1510,8 @@ DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "...\n"); - CallAnalyzer CA(CalleeTTI, GetAssumptionCache, PSI, *Callee, CS, Params); + CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, *Callee, CS, + Params); bool ShouldInline = CA.analyzeCall(CS); DEBUG(CA.dump()); @@ -1565,6 +1584,9 @@ // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. Params.HotCallSiteThreshold = HotCallSiteThreshold; + // Set the ColdCallSiteThreshold knob from the -cold-callsite-threshold. + Params.ColdCallSiteThreshold = ColdCallSiteThreshold; + // Set the OptMinSizeThreshold and OptSizeThreshold params only if the // Set the OptMinSizeThreshold and OptSizeThreshold params only if the // -inlinehint-threshold commandline option is not explicitly given. If that Index: lib/Analysis/ProfileSummaryInfo.cpp =================================================================== --- lib/Analysis/ProfileSummaryInfo.cpp +++ lib/Analysis/ProfileSummaryInfo.cpp @@ -12,9 +12,10 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallSite.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/ProfileSummary.h" @@ -135,12 +136,46 @@ // not update/scale branch weights. Unlike false negatives, this will not cause // performance problem. uint64_t TotalCount; - if (B->getTerminator()->extractProfTotalWeight(TotalCount) && - isHotCount(TotalCount)) + auto *TI = B->getTerminator(); + if (isa(TI)) + return false; + if (TI->extractProfTotalWeight(TotalCount) && isHotCount(TotalCount)) return true; return false; } +bool ProfileSummaryInfo::isColdBB(const BasicBlock *B, + BlockFrequencyInfo *BFI) { + auto Count = BFI->getBlockProfileCount(B); + return Count && isColdCount(*Count); +} + +bool ProfileSummaryInfo::isHotCallSite(const CallSite &CS, + BlockFrequencyInfo *BFI) { + auto *CallInst = CS.getInstruction(); + if (!CS) + return false; + // Check if there is a profile metadata on the instruction. If it is present, + // determine hotness solely based on that. + uint64_t TotalCount; + if (CallInst->extractProfTotalWeight(TotalCount)) + return isHotCount(TotalCount); + return BFI && isHotBB(CallInst->getParent(), BFI); +} + +bool ProfileSummaryInfo::isColdCallSite(const CallSite &CS, + BlockFrequencyInfo *BFI) { + auto *CallInst = CS.getInstruction(); + if (!CS) + return false; + // Check if there is a profile metadata on the instruction. If it is present, + // and tells that the callsite is not cold, then return false; + uint64_t TotalCount; + if (CallInst->extractProfTotalWeight(TotalCount) && !isColdCount(TotalCount)) + return false; + return BFI && isColdBB(CallInst->getParent(), BFI); +} + INITIALIZE_PASS(ProfileSummaryInfoWrapperPass, "profile-summary-info", "Profile summary info", false, true) Index: lib/Transforms/IPO/InlineSimple.cpp =================================================================== --- lib/Transforms/IPO/InlineSimple.cpp +++ lib/Transforms/IPO/InlineSimple.cpp @@ -61,7 +61,8 @@ [&](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; - return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, PSI); + return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, + /*GetBFI=*/None, PSI); } bool runOnSCC(CallGraphSCC &SCC) override; Index: lib/Transforms/IPO/Inliner.cpp =================================================================== --- lib/Transforms/IPO/Inliner.cpp +++ lib/Transforms/IPO/Inliner.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/OptimizationDiagnosticInfo.h" @@ -765,15 +766,17 @@ [&](Function &F) -> AssumptionCache & { return FAM.getResult(F); }; + auto GetBFILambda = [&](Function &F) -> BlockFrequencyInfo & { + return FAM.getResult(F); + }; - // Setup the data structure used to plumb customization into the - // `InlineFunction` routine. - InlineFunctionInfo IFI(/*cg=*/nullptr, &GetAssumptionCache); + function_ref GetBFI(GetBFILambda); auto GetInlineCost = [&](CallSite CS) { Function &Callee = *CS.getCalledFunction(); auto &CalleeTTI = FAM.getResult(Callee); - return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, PSI); + return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, GetBFI, + PSI); }; // We use a worklist of nodes to process so that we can handle if the SCC @@ -843,6 +846,13 @@ if (!shouldInline(CS, GetInlineCost, ORE)) continue; + // Setup the data structure used to plumb customization into the + // `InlineFunction` routine. + InlineFunctionInfo IFI( + /*cg=*/nullptr, &GetAssumptionCache, + &FAM.getResult(*(CS.getCaller())), + &FAM.getResult(Callee)); + if (!InlineFunction(CS, IFI)) continue; DidInline = true; Index: lib/Transforms/Utils/InlineFunction.cpp =================================================================== --- lib/Transforms/Utils/InlineFunction.cpp +++ lib/Transforms/Utils/InlineFunction.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/EHPersonalities.h" @@ -40,8 +41,8 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/Local.h" #include using namespace llvm; @@ -1393,6 +1394,56 @@ } } } +/// Update the block frequencies of the caller after a callee has been inlined. +/// +/// Each block cloned into the caller has its block frequency scaled by the +/// ratio of CallSiteFreq/CalleeEntryFreq. This ensures that the cloned copy of +/// callee's entry block gets the same freqquency as the callsite block and the +/// relative frequencies of all cloned blocks remain the same after cloning. +static void updateCallerBFI(BasicBlock *CallSiteBlock, + const ValueToValueMapTy &VMap, + BlockFrequencyInfo *CallerBFI, + BlockFrequencyInfo *CalleeBFI, + const BasicBlock &CalleeEntryBlock) { + SmallVector ClonedBBs; + for (auto const &Entry : VMap) { + if (!isa(Entry.first) || !Entry.second) + continue; + const BasicBlock *OrigBB = cast(Entry.first); + BasicBlock *ClonedBB = cast(Entry.second); + ClonedBBs.push_back(ClonedBB); + CallerBFI->setBlockFreq(ClonedBB, + CalleeBFI->getBlockFreq(OrigBB).getFrequency()); + } + BasicBlock *EntryClone = cast(VMap.lookup(&CalleeEntryBlock)); + CallerBFI->setBlockFreqAndScale( + EntryClone, CallerBFI->getBlockFreq(CallSiteBlock).getFrequency(), + ClonedBBs); +} + +/// Update the entry count of callee after inlining. +/// +/// The callsite's block count is subtracted from the callee's function entry +/// count. +static void updateCalleeCount(BlockFrequencyInfo &CallerBFI, BasicBlock *CallBB, + Function *Callee) { + // If the callee has a original count of N, and the estimated count of + // callsite is M, the new callee count is set to N - M. M is estimated from + // the caller's entry count, its entry block frequency and the block frequency + // of the callsite. + Optional CalleeCount = Callee->getEntryCount(); + if (!CalleeCount) + return; + Optional CallSiteCount = CallerBFI.getBlockProfileCount(CallBB); + if (!CallSiteCount) + return; + // Since CallSiteCount is an estimate, it could exceed the original callee + // count and has to be set to 0. + if (CallSiteCount.getValue() > CalleeCount.getValue()) + Callee->setEntryCount(0); + else + Callee->setEntryCount(CalleeCount.getValue() - CallSiteCount.getValue()); +} /// This function inlines the called function into the basic block of the /// caller. This returns false if it is not possible to inline this call. @@ -1410,8 +1461,8 @@ // If IFI has any state in it, zap it before we fill it in. IFI.reset(); - - const Function *CalledFunc = CS.getCalledFunction(); + + Function *CalledFunc = CS.getCalledFunction(); if (!CalledFunc || // Can't inline external function or indirect CalledFunc->isDeclaration() || // call, or call to a vararg function! CalledFunc->getFunctionType()->isVarArg()) return false; @@ -1578,10 +1629,17 @@ CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, /*ModuleLevelChanges=*/false, Returns, ".i", &InlinedFunctionInfo, TheCall); - // Remember the first block that is newly cloned over. FirstNewBlock = LastBlock; ++FirstNewBlock; + if (IFI.CallerBFI != nullptr && IFI.CalleeBFI != nullptr) { + // Update the BFI of blocks cloned into the caller. + updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI, + CalledFunc->front()); + // Update the profile count of callee. + updateCalleeCount(*IFI.CallerBFI, OrigBB, CalledFunc); + } + // Inject byval arguments initialization. for (std::pair &Init : ByValInit) HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(), @@ -2182,6 +2240,11 @@ // unreachable, delete it. It can only contain a bitcast and ret. if (InlinedMustTailCalls && pred_begin(AfterCallBB) == pred_end(AfterCallBB)) AfterCallBB->eraseFromParent(); + else if (IFI.CallerBFI) { + // Copy original BB's block frequency to AfterCallBB + IFI.CallerBFI->setBlockFreq( + AfterCallBB, IFI.CallerBFI->getBlockFreq(OrigBB).getFrequency()); + } // We should always be able to fold the entry block of the function into the // single predecessor of the block... Index: test/Transforms/Inline/function-count-update-2.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/function-count-update-2.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -passes='require,cgscc(inline)' -S | FileCheck %s + +; This tests that the function count of a callee gets correctly updated after it +; has been inlined into a two callsites. + +; CHECK: @callee() !prof [[COUNT:![0-9]+]] +define i32 @callee() !prof !1 { + ret i32 0 +} + +define i32 @caller1() !prof !2 { + %i = call i32 @callee() + ret i32 %i +} + +define i32 @caller2() !prof !3 { + %i = call i32 @callee() + ret i32 %i +} + +!llvm.module.flags = !{!0} +; CHECK: [[COUNT]] = !{!"function_entry_count", i64 0} +!0 = !{i32 1, !"MaxFunctionCount", i32 1000} +!1 = !{!"function_entry_count", i64 1000} +!2 = !{!"function_entry_count", i64 600} +!3 = !{!"function_entry_count", i64 400} + Index: test/Transforms/Inline/function-count-update-3.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/function-count-update-3.ll @@ -0,0 +1,74 @@ +; RUN: opt < %s -passes='require,cgscc(inline)' -S -inline-threshold=50 | FileCheck %s + +; This tests that the function count of a function gets properly scaled after +; inlining a call chain leading to the function. +; Function a calls c with count 200 (C1) +; Function b calls c with count 300 +; Function c calls e with count 250 (C2) +; Entry count of e is 500 (C3) +; c->e inlining does not happen since the cost exceeds threshold. +; c then inlined into a. +; e now gets inlined into a (through c) since the branch condition in e is now +; known and hence the cost gets reduced. +; Estimated count of a->e callsite = C2 * (C1 / C3) +; Estimated count of a->e callsite = 250 * (200 / 500) = 100 +; Remaining count of e = C3 - 100 = 500 - 100 = 400 + +@data = external global i32 + +define i32 @a(i32 %a1) !prof !1 { + %a2 = call i32 @c(i32 %a1, i32 1) + ret i32 %a2 +} + +define i32 @b(i32 %b1) !prof !2 { + %b2 = call i32 @c(i32 %b1, i32 %b1) + ret i32 %b2 +} + +declare void @ext(); + +define i32 @c(i32 %c1, i32 %c100) !prof !3 { + call void @ext() + %cond = icmp sle i32 %c1, 1 + br i1 %cond, label %cond_true, label %cond_false + +cond_false: + ret i32 0 + +cond_true: + %c11 = call i32 @e(i32 %c100) + ret i32 %c11 +} + + +; CHECK: @e(i32 %c1) !prof [[COUNT:![0-9]+]] +define i32 @e(i32 %c1) !prof !4 { + %cond = icmp sle i32 %c1, 1 + br i1 %cond, label %cond_true, label %cond_false + +cond_false: + call void @ext() + %c2 = load i32, i32* @data, align 4 + %c3 = add i32 %c1, %c2 + %c4 = mul i32 %c3, %c2 + %c5 = add i32 %c4, %c2 + %c6 = mul i32 %c5, %c2 + %c7 = add i32 %c6, %c2 + %c8 = mul i32 %c7, %c2 + %c9 = add i32 %c8, %c2 + %c10 = mul i32 %c9, %c2 + ret i32 %c10 + +cond_true: + ret i32 0 +} + +!llvm.module.flags = !{!0} +; CHECK: [[COUNT]] = !{!"function_entry_count", i64 400} +!0 = !{i32 1, !"MaxFunctionCount", i32 5000} +!1 = !{!"function_entry_count", i64 200} +!2 = !{!"function_entry_count", i64 300} +!3 = !{!"function_entry_count", i64 500} +!4 = !{!"function_entry_count", i64 500} + Index: test/Transforms/Inline/function-count-update.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/function-count-update.ll @@ -0,0 +1,50 @@ +; RUN: opt < %s -passes='require,cgscc(inline)' -S | FileCheck %s + +; This tests that the function count of two callees get correctly updated after +; they have been inlined into two back-to-back callsites in a single basic block +; in the caller. The callees have the alwaysinline attribute and so they get +; inlined both with the regular inliner pass and the always inline pass. In +; both cases, the new count of each callee is the original count minus callsite +; count which is 200 (since the caller's entry count is 400 and the block +; containing the calls have a relative block frequency of 0.5). + +; CHECK: @callee1(i32 %n) #0 !prof [[COUNT1:![0-9]+]] +define i32 @callee1(i32 %n) #0 !prof !1 { + %cond = icmp sle i32 %n, 10 + br i1 %cond, label %cond_true, label %cond_false + +cond_true: + %r1 = add i32 %n, 1 + ret i32 %r1 +cond_false: + %r2 = add i32 %n, 2 + ret i32 %r2 +} + +; CHECK: @callee2(i32 %n) #0 !prof [[COUNT2:![0-9]+]] +define i32 @callee2(i32 %n) #0 !prof !2 { + %r1 = add i32 %n, 1 + ret i32 %r1 +} + +define i32 @caller(i32 %n) !prof !3 { + %cond = icmp sle i32 %n, 100 + br i1 %cond, label %cond_true, label %cond_false + +cond_true: + %i = call i32 @callee1(i32 %n) + %j = call i32 @callee2(i32 %i) + ret i32 %j +cond_false: + ret i32 0 +} + +!llvm.module.flags = !{!0} +; CHECK: [[COUNT1]] = !{!"function_entry_count", i64 800} +; CHECK: [[COUNT2]] = !{!"function_entry_count", i64 1800} +!0 = !{i32 1, !"MaxFunctionCount", i32 1000} +!1 = !{!"function_entry_count", i64 1000} +!2 = !{!"function_entry_count", i64 2000} +!3 = !{!"function_entry_count", i64 400} +attributes #0 = { alwaysinline } + Index: test/Transforms/Inline/inline-cold-callee.ll =================================================================== --- test/Transforms/Inline/inline-cold-callee.ll +++ test/Transforms/Inline/inline-cold-callee.ll @@ -1,5 +1,4 @@ ; RUN: opt < %s -inline -inlinecold-threshold=0 -S | FileCheck %s -; RUN: opt < %s -passes='require,cgscc(inline)' -inlinecold-threshold=0 -S | FileCheck %s ; This tests that a cold callee gets the (lower) inlinecold-threshold even without ; Cold hint and does not get inlined because the cost exceeds the inlinecold-threshold. Index: test/Transforms/Inline/inline-cold-callsite.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/inline-cold-callsite.ll @@ -0,0 +1,61 @@ +; RUN: opt < %s -passes='require,cgscc(inline)' -inline-threshold=100 -inline-cold-callsite-threshold=0 -S | FileCheck %s + +; This tests that a cold callsite gets the inline-cold-callsite-threshold +; and does not get inlined. Another callsite to an identical callee that +; is not cold gets inlined because cost is below the inline-threshold. + +define i32 @callee1(i32 %x) !prof !21 { + %x1 = add i32 %x, 1 + %x2 = add i32 %x1, 1 + %x3 = add i32 %x2, 1 + call void @extern() + ret i32 %x3 +} + +define i32 @callee2(i32 %x) !prof !21 { +; CHECK-LABEL: @callee2( + %x1 = add i32 %x, 1 + %x2 = add i32 %x1, 1 + %x3 = add i32 %x2, 1 + call void @extern() + ret i32 %x3 +} + +define i32 @caller(i32 %n) !prof !22 { +; CHECK-LABEL: @caller( + %cond = icmp sle i32 %n, 100 + br i1 %cond, label %cond_true, label %cond_false, !prof !0 + +cond_true: +; CHECK-NOT: call i32 @callee1 +; CHECK: ret i32 %x3.i + %i = call i32 @callee1(i32 %n) + ret i32 %i +cond_false: +; CHECK: call i32 @callee2 +; CHECK: ret i32 %j + %j = call i32 @callee2(i32 %n) + ret i32 %j +} +declare void @extern() + +!0 = !{!"branch_weights", i32 200, i32 1} + +!llvm.module.flags = !{!1} +!21 = !{!"function_entry_count", i64 200} +!22 = !{!"function_entry_count", i64 200} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 1000} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 1000} +!8 = !{!"NumCounts", i64 3} +!9 = !{!"NumFunctions", i64 3} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 1000, i32 1} +!13 = !{i32 999000, i64 1000, i32 1} +!14 = !{i32 999999, i64 1, i32 2} Index: test/Transforms/Inline/inline-hot-callee.ll =================================================================== --- test/Transforms/Inline/inline-hot-callee.ll +++ test/Transforms/Inline/inline-hot-callee.ll @@ -1,5 +1,4 @@ ; RUN: opt < %s -inline -inline-threshold=0 -inlinehint-threshold=100 -S | FileCheck %s -; RUN: opt < %s -passes='require,cgscc(inline)' -inline-threshold=0 -inlinehint-threshold=100 -S | FileCheck %s ; This tests that a hot callee gets the (higher) inlinehint-threshold even without ; inline hints and gets inlined because the cost is less than inlinehint-threshold. Index: test/Transforms/Inline/inline-hot-callsite-2.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/inline-hot-callsite-2.ll @@ -0,0 +1,63 @@ +; RUN: opt < %s -passes='require,cgscc(inline)' -inline-threshold=0 -inlinehint-threshold=0 -hot-callsite-threshold=100 -S | FileCheck %s + +; This tests that a callsite which is determined to be hot based on the caller's +; entry count and the callsite block frequency gets the hot-callsite-threshold. +; Another callsite to an identical callee that is not hot does not get inlined +; because cost exceeds the inline-threshold. inlinthint-threshold is set to 0 +; to ensure callee's hotness is not used to boost the threshold. + +define i32 @callee1(i32 %x) !prof !21 { + %x1 = add i32 %x, 1 + %x2 = add i32 %x1, 1 + %x3 = add i32 %x2, 1 + call void @extern() + ret i32 %x3 +} + +define i32 @callee2(i32 %x) !prof !21 { +; CHECK-LABEL: @callee2( + %x1 = add i32 %x, 1 + %x2 = add i32 %x1, 1 + %x3 = add i32 %x2, 1 + call void @extern() + ret i32 %x3 +} + +define i32 @caller(i32 %n) !prof !22 { +; CHECK-LABEL: @caller( + %cond = icmp sle i32 %n, 100 + br i1 %cond, label %cond_true, label %cond_false, !prof !0 + +cond_true: +; CHECK-NOT: call i32 @callee1 +; CHECK: ret i32 %x3.i + %i = call i32 @callee1(i32 %n) + ret i32 %i +cond_false: +; CHECK: call i32 @callee2 +; CHECK: ret i32 %j + %j = call i32 @callee2(i32 %n) + ret i32 %j +} +declare void @extern() + +!0 = !{!"branch_weights", i32 64, i32 4} + +!llvm.module.flags = !{!1} +!21 = !{!"function_entry_count", i64 200} +!22 = !{!"function_entry_count", i64 200} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 1000} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 1000} +!8 = !{!"NumCounts", i64 3} +!9 = !{!"NumFunctions", i64 3} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 100, i32 1} +!13 = !{i32 999000, i64 100, i32 1} +!14 = !{i32 999999, i64 1, i32 2}