Index: include/llvm/Analysis/InlineCost.h =================================================================== --- include/llvm/Analysis/InlineCost.h +++ include/llvm/Analysis/InlineCost.h @@ -21,6 +21,7 @@ namespace llvm { class AssumptionCacheTracker; +class BlockFrequencyInfo; class CallSite; class DataLayout; class Function; @@ -137,6 +138,9 @@ /// Threshold to use when the callsite is considered hot. Optional HotCallSiteThreshold; + + /// Threshold to use when the callsite is considered cold. + Optional ColdCallSiteThreshold; }; /// Generate the parameters to tune the inline cost analysis based only on the @@ -171,6 +175,7 @@ getInlineCost(CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI, std::function &GetAssumptionCache, + std::function *GetBFI, ProfileSummaryInfo *PSI); /// \brief Get an InlineCost with the callee explicitly specified. @@ -182,6 +187,7 @@ getInlineCost(CallSite CS, Function *Callee, const InlineParams &Params, TargetTransformInfo &CalleeTTI, std::function &GetAssumptionCache, + std::function *GetBFI, ProfileSummaryInfo *PSI); /// \brief Minimal filter to detect invalid constructs for inlining. Index: include/llvm/Transforms/Utils/Cloning.h =================================================================== --- include/llvm/Transforms/Utils/Cloning.h +++ include/llvm/Transforms/Utils/Cloning.h @@ -47,6 +47,7 @@ class LoopInfo; class AllocaInst; class AssumptionCacheTracker; +class BlockFrequencyInfo; class DominatorTree; /// Return an exact copy of the specified module @@ -80,6 +81,9 @@ /// originally inserted callsites were DCE'ed after they were cloned. std::vector OperandBundleCallSites; + /// Map from original block to new blocks updated by the cloner. + std::unique_ptr> ClonedBlocks; + ClonedCodeInfo() : ContainsCalls(false), ContainsDynamicAllocas(false) {} }; @@ -176,15 +180,18 @@ /// InlineFunction call, and records the auxiliary results produced by it. class InlineFunctionInfo { public: - explicit InlineFunctionInfo(CallGraph *cg = nullptr, - std::function - *GetAssumptionCache = nullptr) - : CG(cg), GetAssumptionCache(GetAssumptionCache) {} + explicit InlineFunctionInfo( + CallGraph *cg = nullptr, + std::function *GetAssumptionCache = + nullptr, + std::function *GetBFI = nullptr) + : CG(cg), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI) {} /// CG - If non-null, InlineFunction will update the callgraph to reflect the /// changes it makes. CallGraph *CG; std::function *GetAssumptionCache; + std::function *GetBFI; /// StaticAllocas - InlineFunction fills this in with all static allocas that /// get copied into the caller. Index: lib/Analysis/InlineCost.cpp =================================================================== --- lib/Analysis/InlineCost.cpp +++ lib/Analysis/InlineCost.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -48,6 +49,11 @@ "inlinehint-threshold", cl::Hidden, cl::init(325), cl::desc("Threshold for inlining functions with inline hint")); +static cl::opt + ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden, + cl::init(45), + cl::desc("Threshold for inlining cold callsites")); + // We introduce this threshold to help performance of instrumentation based // PGO before we actually hook up inliner with analysis passes such as BPI and // BFI. @@ -72,6 +78,9 @@ /// Getter for the cache of @llvm.assume intrinsics. std::function &GetAssumptionCache; + /// Getter for BlockFrequencyInfo + std::function *GetBFI; + /// Profile summary information. ProfileSummaryInfo *PSI; @@ -85,7 +94,6 @@ /// Tunable parameters that control the analysis. const InlineParams &Params; - int Threshold; int Cost; @@ -202,19 +210,21 @@ public: CallAnalyzer(const TargetTransformInfo &TTI, std::function &GetAssumptionCache, + std::function *GetBFI, ProfileSummaryInfo *PSI, Function &Callee, CallSite CSArg, const InlineParams &Params) - : TTI(TTI), GetAssumptionCache(GetAssumptionCache), PSI(PSI), F(Callee), - CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold), - Cost(0), IsCallerRecursive(false), IsRecursiveCall(false), - ExposesReturnsTwice(false), HasDynamicAlloca(false), - ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false), - HasFrameEscape(false), AllocatedSize(0), NumInstructions(0), - NumVectorInstructions(0), FiftyPercentVectorBonus(0), - TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0), - NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0), - NumConstantPtrDiffs(0), NumInstructionsSimplified(0), - SROACostSavings(0), SROACostSavingsLost(0) {} + : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI), + PSI(PSI), F(Callee), CandidateCS(CSArg), Params(Params), + Threshold(Params.DefaultThreshold), Cost(0), IsCallerRecursive(false), + IsRecursiveCall(false), ExposesReturnsTwice(false), + HasDynamicAlloca(false), ContainsNoDuplicateCall(false), + HasReturn(false), HasIndirectBr(false), HasFrameEscape(false), + AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0), + FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0), + NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), + NumConstantPtrCmps(0), NumConstantPtrDiffs(0), + NumInstructionsSimplified(0), SROACostSavings(0), + SROACostSavingsLost(0) {} bool analyzeCall(CallSite CS); @@ -642,15 +652,18 @@ PSI->isHotCount(TotalWeight)) { HotCallsite = true; } + bool UseCallsiteHotness = GetBFI != nullptr; // Listen to the inlinehint attribute or profile based hotness information // when it would increase the threshold and the caller does not need to // minimize its size. - bool InlineHint = Callee.hasFnAttribute(Attribute::InlineHint) || - (PSI && PSI->isFunctionEntryHot(&Callee)); + bool InlineHint = + Callee.hasFnAttribute(Attribute::InlineHint) || + (!UseCallsiteHotness && PSI && PSI->isFunctionEntryHot(&Callee)); if (InlineHint && !Caller->optForMinSize()) Threshold = MaxIfValid(Threshold, Params.HintThreshold); + BasicBlock *CallBB = CS.getInstruction()->getParent(); if (HotCallsite && !Caller->optForMinSize()) Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold); @@ -659,6 +672,19 @@ // the threshold. if (ColdCallee) Threshold = MinIfValid(Threshold, Params.ColdThreshold); + if (UseCallsiteHotness && PSI && CallBB) { + auto &CallerBFI = (*GetBFI)(*Caller); + auto CallsiteCount = CallerBFI.getBlockProfileCount(CallBB); + if (CallsiteCount) { + DEBUG(llvm::dbgs() + << " Callsite count = " << CallsiteCount.getValue() + << (PSI->isHotCount(CallsiteCount.getValue()) ? " (HOT)\n" : "\n")); + if (PSI->isHotCount(CallsiteCount.getValue())) + Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold); + else if (PSI->isColdCount(CallsiteCount.getValue())) + Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold); + } + } // Finally, take the target-specific inlining threshold multiplier into // account. @@ -962,7 +988,8 @@ // out. Pretend to inline the function, with a custom threshold. auto IndirectCallParams = Params; IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold; - CallAnalyzer CA(TTI, GetAssumptionCache, PSI, *F, CS, IndirectCallParams); + CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, *F, CS, + IndirectCallParams); if (CA.analyzeCall(CS)) { // We were able to inline the indirect call! Subtract the cost from the // threshold to get the bonus we want to apply, but don't go below zero. @@ -1452,15 +1479,17 @@ InlineCost llvm::getInlineCost( CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI, std::function &GetAssumptionCache, + std::function *GetBFI, ProfileSummaryInfo *PSI) { return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI, - GetAssumptionCache, PSI); + GetAssumptionCache, GetBFI, PSI); } InlineCost llvm::getInlineCost( CallSite CS, Function *Callee, const InlineParams &Params, TargetTransformInfo &CalleeTTI, std::function &GetAssumptionCache, + std::function *GetBFI, ProfileSummaryInfo *PSI) { // Cannot inline indirect calls. @@ -1495,7 +1524,8 @@ DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "...\n"); - CallAnalyzer CA(CalleeTTI, GetAssumptionCache, PSI, *Callee, CS, Params); + CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, *Callee, CS, + Params); bool ShouldInline = CA.analyzeCall(CS); DEBUG(CA.dump()); @@ -1568,6 +1598,9 @@ // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. Params.HotCallSiteThreshold = HotCallSiteThreshold; + // Set the ColdCallSiteThreshold knob from the -cold-callsite-threshold. + Params.ColdCallSiteThreshold = ColdCallSiteThreshold; + // Set the OptMinSizeThreshold and OptSizeThreshold params only if the // Set the OptMinSizeThreshold and OptSizeThreshold params only if the // -inlinehint-threshold commandline option is not explicitly given. If that Index: lib/Transforms/IPO/InlineSimple.cpp =================================================================== --- lib/Transforms/IPO/InlineSimple.cpp +++ lib/Transforms/IPO/InlineSimple.cpp @@ -61,7 +61,8 @@ [&](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; - return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, PSI); + return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, + /*GetBFI=*/nullptr, PSI); } bool runOnSCC(CallGraphSCC &SCC) override; Index: lib/Transforms/IPO/Inliner.cpp =================================================================== --- lib/Transforms/IPO/Inliner.cpp +++ lib/Transforms/IPO/Inliner.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/OptimizationDiagnosticInfo.h" @@ -766,14 +767,20 @@ return FAM.getResult(F); }; + std::function GetBFI = + [&](Function &F) -> BlockFrequencyInfo & { + return FAM.getResult(F); + }; + // Setup the data structure used to plumb customization into the // `InlineFunction` routine. - InlineFunctionInfo IFI(/*cg=*/nullptr, &GetAssumptionCache); + InlineFunctionInfo IFI(/*cg=*/nullptr, &GetAssumptionCache, &GetBFI); auto GetInlineCost = [&](CallSite CS) { Function &Callee = *CS.getCalledFunction(); auto &CalleeTTI = FAM.getResult(Callee); - return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, PSI); + return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, &GetBFI, + PSI); }; // We use a worklist of nodes to process so that we can handle if the SCC Index: lib/Transforms/Utils/CloneFunction.cpp =================================================================== --- lib/Transforms/Utils/CloneFunction.cpp +++ lib/Transforms/Utils/CloneFunction.cpp @@ -255,6 +255,8 @@ // Nope, clone it now. BasicBlock *NewBB; BBEntry = NewBB = BasicBlock::Create(BB->getContext()); + if (CodeInfo->ClonedBlocks) + (*CodeInfo->ClonedBlocks)[BB] = NewBB; if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix); // It is only legal to clone a function if a block address within that Index: lib/Transforms/Utils/InlineFunction.cpp =================================================================== --- lib/Transforms/Utils/InlineFunction.cpp +++ lib/Transforms/Utils/InlineFunction.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/EHPersonalities.h" @@ -1394,6 +1395,60 @@ } } +/// Update the block frequencies of the caller after a callee has been inlined. +/// +/// Each block cloned into the caller has its block frequency scaled by the +/// ratio of CallSiteFreq/CalleeEntryFreq. This ensures that the cloned copy of +/// callee's entry block gets the same freqquency as the callsite block and the +/// relative frequencies of all cloned blocks remain the same after cloning. +static void +updateCallerBFI(BasicBlock *CallSiteBlock, + DenseMap *ClonedBlocks, + InlineFunctionInfo &IFI, BlockFrequencyInfo &CallerBFI, + BlockFrequencyInfo &CalleeBFI) { + if (!ClonedBlocks) + return; + + for (auto &Entry : *ClonedBlocks) { + // Use 128 bits APInt to avoid overflow. + APInt CallSiteFreq(128, + CallerBFI.getBlockFreq(CallSiteBlock).getFrequency()); + APInt CalleeEntryFreq(128, CalleeBFI.getEntryFreq()); + APInt BBFreq(128, CalleeBFI.getBlockFreq(Entry.first).getFrequency()); + + // Multiply first by CallSiteFreq and then divide by CalleeEntryFreq + // to minimize loss of precision. + BBFreq *= CallSiteFreq; + BBFreq = BBFreq.udiv(CalleeEntryFreq); + + CallerBFI.setBlockFreq(Entry.second, BBFreq.getLimitedValue()); + } +} + +/// Update the entry count of callee after inlining. +/// +/// The callsite's block count is subtracted from the callee's function entry +/// count. +static void updateCalleeCount(BlockFrequencyInfo &CallerBFI, BasicBlock *CallBB, + Function *Callee) { + // If the callee has a original count of N, and the estimated count of + // callsite is M, the new callee count is set to N - M. M is estimated from + // the caller's entry count, its entry block frequency and the block frequency + // of the callsite. + Optional CalleeCount = Callee->getEntryCount(); + if (!CalleeCount) + return; + Optional CallSiteCount = CallerBFI.getBlockProfileCount(CallBB); + if (!CallSiteCount) + return; + // Since CallSiteCount is an estimate, it could exceed the original callee + // count and has to be set to 0. + if (CallSiteCount.getValue() > CalleeCount.getValue()) + Callee->setEntryCount(0); + else + Callee->setEntryCount(CalleeCount.getValue() - CallSiteCount.getValue()); +} + /// This function inlines the called function into the basic block of the /// caller. This returns false if it is not possible to inline this call. /// The program is still in a well defined state if this occurs though. @@ -1410,8 +1465,8 @@ // If IFI has any state in it, zap it before we fill it in. IFI.reset(); - - const Function *CalledFunc = CS.getCalledFunction(); + + Function *CalledFunc = CS.getCalledFunction(); if (!CalledFunc || // Can't inline external function or indirect CalledFunc->isDeclaration() || // call, or call to a vararg function! CalledFunc->getFunctionType()->isVarArg()) return false; @@ -1532,6 +1587,8 @@ // function. SmallVector Returns; ClonedCodeInfo InlinedFunctionInfo; + InlinedFunctionInfo.ClonedBlocks.reset( + new DenseMap()); Function::iterator FirstNewBlock; { // Scope to destroy VMap after cloning. @@ -1578,6 +1635,15 @@ CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, /*ModuleLevelChanges=*/false, Returns, ".i", &InlinedFunctionInfo, TheCall); + if (IFI.GetBFI) { + auto &CallerBFI = (*IFI.GetBFI)(*Caller); + auto &CalleeBFI = (*IFI.GetBFI)(*CalledFunc); + // Update the block frequencies of the cloned blocks in the caller + updateCallerBFI(OrigBB, InlinedFunctionInfo.ClonedBlocks.get(), IFI, + CallerBFI, CalleeBFI); + // Update the profile count of callee. + updateCalleeCount(CallerBFI, OrigBB, CalledFunc); + } // Remember the first block that is newly cloned over. FirstNewBlock = LastBlock; ++FirstNewBlock; @@ -2182,6 +2248,12 @@ // unreachable, delete it. It can only contain a bitcast and ret. if (InlinedMustTailCalls && pred_begin(AfterCallBB) == pred_end(AfterCallBB)) AfterCallBB->eraseFromParent(); + else if (IFI.GetBFI) { + auto &CallerBFI = (*IFI.GetBFI)(*Caller); + // Copy original BB's block frequency to AfterCallBB + CallerBFI.setBlockFreq(AfterCallBB, + CallerBFI.getBlockFreq(OrigBB).getFrequency()); + } // We should always be able to fold the entry block of the function into the // single predecessor of the block... Index: test/Transforms/Inline/function-count-update-2.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/function-count-update-2.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -passes='require,cgscc(inline)' -S | FileCheck %s + +; This tests that the function count of a callee gets correctly updated after it +; has been inlined into a two callsites. + +; CHECK: @callee() !prof [[COUNT:![0-9]+]] +define i32 @callee() !prof !1 { + ret i32 0 +} + +define i32 @caller1() !prof !2 { + %i = call i32 @callee() + ret i32 %i +} + +define i32 @caller2() !prof !3 { + %i = call i32 @callee() + ret i32 %i +} + +!llvm.module.flags = !{!0} +; CHECK: [[COUNT]] = !{!"function_entry_count", i64 0} +!0 = !{i32 1, !"MaxFunctionCount", i32 1000} +!1 = !{!"function_entry_count", i64 1000} +!2 = !{!"function_entry_count", i64 600} +!3 = !{!"function_entry_count", i64 400} + Index: test/Transforms/Inline/function-count-update-3.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/function-count-update-3.ll @@ -0,0 +1,74 @@ +; RUN: opt < %s -passes='require,cgscc(inline)' -S -inline-threshold=50 | FileCheck %s + +; This tests that the function count of a function gets properly scaled after +; inlining a call chain leading to the function. +; Function a calls c with count 200 (C1) +; Function b calls c with count 300 +; Function c calls e with count 250 (C2) +; Entry count of e is 500 (C3) +; c->e inlining does not happen since the cost exceeds threshold. +; c then inlined into a. +; e now gets inlined into a (through c) since the branch condition in e is now +; known and hence the cost gets reduced. +; Estimated count of a->e callsite = C2 * (C1 / C3) +; Estimated count of a->e callsite = 250 * (200 / 500) = 100 +; Remaining count of e = C3 - 100 = 500 - 100 = 400 + +@data = external global i32 + +define i32 @a(i32 %a1) !prof !1 { + %a2 = call i32 @c(i32 %a1, i32 1) + ret i32 %a2 +} + +define i32 @b(i32 %b1) !prof !2 { + %b2 = call i32 @c(i32 %b1, i32 %b1) + ret i32 %b2 +} + +declare void @ext(); + +define i32 @c(i32 %c1, i32 %c100) !prof !3 { + call void @ext() + %cond = icmp sle i32 %c1, 1 + br i1 %cond, label %cond_true, label %cond_false + +cond_false: + ret i32 0 + +cond_true: + %c11 = call i32 @e(i32 %c100) + ret i32 %c11 +} + + +; CHECK: @e(i32 %c1) !prof [[COUNT:![0-9]+]] +define i32 @e(i32 %c1) !prof !4 { + %cond = icmp sle i32 %c1, 1 + br i1 %cond, label %cond_true, label %cond_false + +cond_false: + call void @ext() + %c2 = load i32, i32* @data, align 4 + %c3 = add i32 %c1, %c2 + %c4 = mul i32 %c3, %c2 + %c5 = add i32 %c4, %c2 + %c6 = mul i32 %c5, %c2 + %c7 = add i32 %c6, %c2 + %c8 = mul i32 %c7, %c2 + %c9 = add i32 %c8, %c2 + %c10 = mul i32 %c9, %c2 + ret i32 %c10 + +cond_true: + ret i32 0 +} + +!llvm.module.flags = !{!0} +; CHECK: [[COUNT]] = !{!"function_entry_count", i64 400} +!0 = !{i32 1, !"MaxFunctionCount", i32 5000} +!1 = !{!"function_entry_count", i64 200} +!2 = !{!"function_entry_count", i64 300} +!3 = !{!"function_entry_count", i64 500} +!4 = !{!"function_entry_count", i64 500} + Index: test/Transforms/Inline/function-count-update.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/function-count-update.ll @@ -0,0 +1,50 @@ +; RUN: opt < %s -passes='require,cgscc(inline)' -S | FileCheck %s + +; This tests that the function count of two callees get correctly updated after +; they have been inlined into two back-to-back callsites in a single basic block +; in the caller. The callees have the alwaysinline attribute and so they get +; inlined both with the regular inliner pass and the always inline pass. In +; both cases, the new count of each callee is the original count minus callsite +; count which is 200 (since the caller's entry count is 400 and the block +; containing the calls have a relative block frequency of 0.5). + +; CHECK: @callee1(i32 %n) #0 !prof [[COUNT1:![0-9]+]] +define i32 @callee1(i32 %n) #0 !prof !1 { + %cond = icmp sle i32 %n, 10 + br i1 %cond, label %cond_true, label %cond_false + +cond_true: + %r1 = add i32 %n, 1 + ret i32 %r1 +cond_false: + %r2 = add i32 %n, 2 + ret i32 %r2 +} + +; CHECK: @callee2(i32 %n) #0 !prof [[COUNT2:![0-9]+]] +define i32 @callee2(i32 %n) #0 !prof !2 { + %r1 = add i32 %n, 1 + ret i32 %r1 +} + +define i32 @caller(i32 %n) !prof !3 { + %cond = icmp sle i32 %n, 100 + br i1 %cond, label %cond_true, label %cond_false + +cond_true: + %i = call i32 @callee1(i32 %n) + %j = call i32 @callee2(i32 %i) + ret i32 %j +cond_false: + ret i32 0 +} + +!llvm.module.flags = !{!0} +; CHECK: [[COUNT1]] = !{!"function_entry_count", i64 800} +; CHECK: [[COUNT2]] = !{!"function_entry_count", i64 1800} +!0 = !{i32 1, !"MaxFunctionCount", i32 1000} +!1 = !{!"function_entry_count", i64 1000} +!2 = !{!"function_entry_count", i64 2000} +!3 = !{!"function_entry_count", i64 400} +attributes #0 = { alwaysinline } + Index: test/Transforms/Inline/inline-cold-callsite.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/inline-cold-callsite.ll @@ -0,0 +1,61 @@ +; RUN: opt < %s -passes='require,cgscc(inline)' -inline-threshold=100 -inline-cold-callsite-threshold=0 -S | FileCheck %s + +; This tests that a cold callsite gets the inline-cold-callsite-threshold +; and does not get inlined. Another callsite to an identical callee that +; is not cold gets inlined because cost is below the inline-threshold. + +define i32 @callee1(i32 %x) !prof !21 { + %x1 = add i32 %x, 1 + %x2 = add i32 %x1, 1 + %x3 = add i32 %x2, 1 + call void @extern() + ret i32 %x3 +} + +define i32 @callee2(i32 %x) !prof !21 { +; CHECK-LABEL: @callee2( + %x1 = add i32 %x, 1 + %x2 = add i32 %x1, 1 + %x3 = add i32 %x2, 1 + call void @extern() + ret i32 %x3 +} + +define i32 @caller(i32 %n) !prof !22 { +; CHECK-LABEL: @caller( + %cond = icmp sle i32 %n, 100 + br i1 %cond, label %cond_true, label %cond_false, !prof !0 + +cond_true: +; CHECK-NOT: call i32 @callee1 +; CHECK: ret i32 %x3.i + %i = call i32 @callee1(i32 %n) + ret i32 %i +cond_false: +; CHECK: call i32 @callee2 +; CHECK: ret i32 %j + %j = call i32 @callee2(i32 %n) + ret i32 %j +} +declare void @extern() + +!0 = !{!"branch_weights", i32 200, i32 1} + +!llvm.module.flags = !{!1} +!21 = !{!"function_entry_count", i64 200} +!22 = !{!"function_entry_count", i64 200} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 1000} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 1000} +!8 = !{!"NumCounts", i64 3} +!9 = !{!"NumFunctions", i64 3} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 1000, i32 1} +!13 = !{i32 999000, i64 1000, i32 1} +!14 = !{i32 999999, i64 1, i32 2} Index: test/Transforms/Inline/inline-hot-callee.ll =================================================================== --- test/Transforms/Inline/inline-hot-callee.ll +++ test/Transforms/Inline/inline-hot-callee.ll @@ -1,5 +1,4 @@ ; RUN: opt < %s -inline -inline-threshold=0 -inlinehint-threshold=100 -S | FileCheck %s -; RUN: opt < %s -passes='require,cgscc(inline)' -inline-threshold=0 -inlinehint-threshold=100 -S | FileCheck %s ; This tests that a hot callee gets the (higher) inlinehint-threshold even without ; inline hints and gets inlined because the cost is less than inlinehint-threshold. Index: test/Transforms/Inline/inline-hot-callsite-2.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/inline-hot-callsite-2.ll @@ -0,0 +1,62 @@ +; RUN: opt < %s -passes='require,cgscc(inline)' -inline-threshold=0 -hot-callsite-threshold=100 -S | FileCheck %s + +; This tests that a callsite which is determined to be hot based on the caller's +; entry count and the callsite block frequency gets the hot-callsite-threshold. +; Another callsite to an identical callee that is not hot does not get inlined +; because cost exceeds the inline-threshold + +define i32 @callee1(i32 %x) !prof !21 { + %x1 = add i32 %x, 1 + %x2 = add i32 %x1, 1 + %x3 = add i32 %x2, 1 + call void @extern() + ret i32 %x3 +} + +define i32 @callee2(i32 %x) !prof !21 { +; CHECK-LABEL: @callee2( + %x1 = add i32 %x, 1 + %x2 = add i32 %x1, 1 + %x3 = add i32 %x2, 1 + call void @extern() + ret i32 %x3 +} + +define i32 @caller(i32 %n) !prof !22 { +; CHECK-LABEL: @caller( + %cond = icmp sle i32 %n, 100 + br i1 %cond, label %cond_true, label %cond_false, !prof !0 + +cond_true: +; CHECK-NOT: call i32 @callee1 +; CHECK: ret i32 %x3.i + %i = call i32 @callee1(i32 %n) + ret i32 %i +cond_false: +; CHECK: call i32 @callee2 +; CHECK: ret i32 %j + %j = call i32 @callee2(i32 %n) + ret i32 %j +} +declare void @extern() + +!0 = !{!"branch_weights", i32 64, i32 4} + +!llvm.module.flags = !{!1} +!21 = !{!"function_entry_count", i64 200} +!22 = !{!"function_entry_count", i64 200} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 1000} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 1000} +!8 = !{!"NumCounts", i64 3} +!9 = !{!"NumFunctions", i64 3} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 100, i32 1} +!13 = !{i32 999000, i64 100, i32 1} +!14 = !{i32 999999, i64 1, i32 2}