diff --git a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h --- a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h +++ b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h @@ -23,6 +23,7 @@ #include "llvm/ProfileData/SampleProf.h" #include #include +#include using namespace llvm; using namespace sampleprof; @@ -42,7 +43,7 @@ CallSiteLoc(CallLoc){}; ContextTrieNode *getChildContext(const LineLocation &CallSite, StringRef CalleeName); - ContextTrieNode *getChildContext(const LineLocation &CallSite); + ContextTrieNode *getHottestChildContext(const LineLocation &CallSite); ContextTrieNode *getOrCreateChildContext(const LineLocation &CallSite, StringRef CalleeName, bool AllowCreate = true); @@ -94,6 +95,9 @@ // call-site. The full context is identified by location of call instruction. FunctionSamples *getCalleeContextSamplesFor(const CallBase &Inst, StringRef CalleeName); + // Get samples for indirect call targets for call site at given location. + std::vector + getIndirectCalleeContextSamplesFor(const DILocation *DIL); // Query context profile for a given location. The full context // is identified by input DILocation. 
FunctionSamples *getContextSamplesFor(const DILocation *DIL); diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp --- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp +++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp @@ -30,7 +30,7 @@ ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite, StringRef CalleeName) { if (CalleeName.empty()) - return getChildContext(CallSite); + return getHottestChildContext(CallSite); uint32_t Hash = nodeHash(CalleeName, CallSite); auto It = AllChildContext.find(Hash); @@ -40,18 +40,22 @@ } ContextTrieNode * -ContextTrieNode::getChildContext(const LineLocation &CallSite) { +ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) { // CSFDO-TODO: This could be slow, change AllChildContext so we can // do point look up for child node by call site alone. - // CSFDO-TODO: Return the child with max count for indirect call + // Retrieve the child node with max count for indirect call ContextTrieNode *ChildNodeRet = nullptr; + uint64_t MaxCalleeSamples = 0; for (auto &It : AllChildContext) { ContextTrieNode &ChildNode = It.second; - if (ChildNode.CallSiteLoc == CallSite) { - if (ChildNodeRet) - return nullptr; - else - ChildNodeRet = &ChildNode; + if (ChildNode.CallSiteLoc != CallSite) + continue; + FunctionSamples *Samples = ChildNode.getFunctionSamples(); + if (!Samples) + continue; + if (Samples->getTotalSamples() > MaxCalleeSamples) { + ChildNodeRet = &ChildNode; + MaxCalleeSamples = Samples->getTotalSamples(); } } @@ -191,12 +195,12 @@ SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst, StringRef CalleeName) { LLVM_DEBUG(dbgs() << "Getting callee context for instr: " << Inst << "\n"); - // CSFDO-TODO: We use CalleeName to differentiate indirect call - // We need to get sample for indirect callee too. 
DILocation *DIL = Inst.getDebugLoc(); if (!DIL) return nullptr; + // For indirect call, CalleeName will be empty, in which case the context + // profile for callee with largest total samples will be returned. ContextTrieNode *CalleeContext = getCalleeContextFor(DIL, CalleeName); if (CalleeContext) { FunctionSamples *FSamples = CalleeContext->getFunctionSamples(); @@ -209,6 +213,26 @@ return nullptr; } +std::vector<const FunctionSamples *> +SampleContextTracker::getIndirectCalleeContextSamplesFor( + const DILocation *DIL) { + std::vector<const FunctionSamples *> R; + // No location, or no caller context for it: there is nothing to report. + ContextTrieNode *CallerNode = DIL ? getContextFor(DIL) : nullptr; + if (!CallerNode) + return R; + LineLocation CallSite = FunctionSamples::getCallSiteIdentifier(DIL); + for (auto &It : CallerNode->getAllChildContext()) { + ContextTrieNode &ChildNode = It.second; + if (ChildNode.getCallSiteLoc() != CallSite) + continue; + if (FunctionSamples *CalleeSamples = ChildNode.getFunctionSamples()) + R.push_back(CalleeSamples); + } + + return R; +} + FunctionSamples * SampleContextTracker::getContextSamplesFor(const DILocation *DIL) { assert(DIL && "Expect non-null location"); @@ -295,11 +319,6 @@ const Instruction &Inst, StringRef CalleeName) { LLVM_DEBUG(dbgs() << "Promoting and merging context tree for instr: \n" << Inst << "\n"); - // CSFDO-TODO: We also need to promote context profile from indirect - // calls. We won't have callee names from those from call instr. - if (CalleeName.empty()) - return; - // Get the caller context for the call instruction, we don't use callee // name from call because there can be context from indirect calls too. DILocation *DIL = Inst.getDebugLoc(); @@ -307,9 +326,23 @@ if (!CallerNode) return; - // Get the context that needs to be promoted - LineLocation CallSite(FunctionSamples::getOffset(DIL), - DIL->getBaseDiscriminator()); + LineLocation CallSite = FunctionSamples::getCallSiteIdentifier(DIL); + // For indirect call, CalleeName will be empty, in which case we need to + // promote all non-inlined child context profiles. 
+ if (CalleeName.empty()) { + for (auto &It : CallerNode->getAllChildContext()) { + ContextTrieNode *NodeToPromo = &It.second; + if (CallSite != NodeToPromo->getCallSiteLoc()) + continue; + FunctionSamples *FromSamples = NodeToPromo->getFunctionSamples(); + if (FromSamples && FromSamples->getContext().hasState(InlinedContext)) + continue; + promoteMergeContextSamplesTree(*NodeToPromo); + } + return; + } + + // Get the context for the given callee that needs to be promoted ContextTrieNode *NodeToPromo = CallerNode->getChildContext(CallSite, CalleeName); if (!NodeToPromo) @@ -329,6 +362,7 @@ LLVM_DEBUG(dbgs() << " Found context tree root to promote: " << FromSamples->getContext() << "\n"); + assert(!FromSamples->getContext().hasState(InlinedContext)); StringRef ContextStrToRemove = FromSamples->getContext().getCallingContext(); return promoteMergeContextSamplesTree(NodeToPromo, RootContext, ContextStrToRemove); @@ -361,14 +395,12 @@ StringRef CalleeName) { assert(DIL && "Expect non-null location"); - // CSSPGO-TODO: need to support indirect callee - if (CalleeName.empty()) - return nullptr; - ContextTrieNode *CallContext = getContextFor(DIL); if (!CallContext) return nullptr; + // When CalleeName is empty, the child context profile with max + // total samples will be returned. 
return CallContext->getChildContext( LineLocation(FunctionSamples::getOffset(DIL), DIL->getBaseDiscriminator()), diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" +#include "llvm/ADT/PriorityQueue.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -108,6 +109,14 @@ "Number of functions with CFG mismatched profile"); STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile"); +STATISTIC(NumCSInlinedHitMinLimit, + "Number of functions with FDO inline stopped due to min size limit"); +STATISTIC(NumCSInlinedHitMaxLimit, + "Number of functions with FDO inline stopped due to max size limit"); +STATISTIC( + NumCSInlinedHitGrowthLimit, + "Number of functions with FDO inline stopped due to growth size limit"); + // Command line option to specify the file to read samples from. This is // mainly used for debugging. 
static cl::opt<std::string> SampleProfileFile( @@ -171,6 +180,38 @@ cl::desc("Inline cold call sites in profile loader if it's beneficial " "for code size.")); +static cl::opt<int> ProfileInlineGrowthLimit( + "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12), + cl::desc("The size growth ratio limit for priority-based sample profile " + "loader inlining.")); + +static cl::opt<int> ProfileInlineLimitMin( + "sample-profile-inline-limit-min", cl::Hidden, cl::init(100), + cl::desc("The lower bound of size growth limit for " + "priority-based sample profile loader inlining.")); + +static cl::opt<int> ProfileInlineLimitMax( + "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000), + cl::desc("The upper bound of size growth limit for " + "priority-based sample profile loader inlining.")); + +static cl::opt<int> ProfileICPThreshold( + "sample-profile-icp-threshold", cl::Hidden, cl::init(5), + cl::desc( + "Relative hotness threshold for indirect " + "call promotion in priority-based sample profile loader inlining.")); + +static cl::opt<int> SampleHotCallSiteThreshold( + "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000), + cl::desc("Hot callsite threshold for priority-based sample profile loader " + "inlining.")); + +static cl::opt<bool> CallsitePrioritizedInline( + "sample-profile-prioritized-inline", cl::Hidden, cl::ZeroOrMore, + cl::init(true), + cl::desc("Use call site prioritized inlining for sample profile loader. " 
+ "Currently only CSSPGO is supported.")); + static cl::opt<int> SampleColdCallSiteThreshold( "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45), cl::desc("Threshold for inlining cold callsites")); @@ -313,6 +354,31 @@ DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap; }; +// Inline candidate used by iterative callsite prioritized inliner +struct InlineCandidate { + CallBase *CallInstr; + const FunctionSamples *CalleeSamples; + uint64_t CallsiteCount; +}; + +// Inline candidate comparer using call site weight +struct CandidateComparer { + bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) const { + if (LHS.CallsiteCount != RHS.CallsiteCount) + return LHS.CallsiteCount < RHS.CallsiteCount; + + // Tie breaker using GUID so we have stable/deterministic inlining order + assert(LHS.CalleeSamples && RHS.CalleeSamples && + "Expect non-null FunctionSamples"); + return LHS.CalleeSamples->getGUID(LHS.CalleeSamples->getName()) < + RHS.CalleeSamples->getGUID(RHS.CalleeSamples->getName()); + } +}; + +using CandidateQueue = + PriorityQueue<InlineCandidate, std::vector<InlineCandidate>, + CandidateComparer>; + /// Sample profile pass. /// /// This pass reads profile data from the file specified by @@ -351,9 +417,23 @@ findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const; mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap; const FunctionSamples *findFunctionSamples(const Instruction &I) const; - bool inlineCallInstruction(CallBase &CB); + CallBase *tryPromoteIndirectCall(Function &F, StringRef CalleeName, + uint64_t &Sum, uint64_t Count, CallBase *I, + const char *&Reason); + bool inlineCallInstruction(CallBase &CB, + const FunctionSamples *CalleeSamples); bool inlineHotFunctions(Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs); + // Helper functions call-site prioritized BFS inliner + // Will change the main FDO inliner to be work list based directly in + // upstream, then merge this change with that and remove the duplication. 
+ InlineCost shouldInlineCandidate(InlineCandidate &Candidate); + bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB); + bool tryInlineCandidate(InlineCandidate &Candidate, + SmallVector &InlinedCallSites); + bool + inlineHotFunctionsWithPriority(Function &F, + DenseSet &InlinedGUIDs); // Inline cold/small functions in addition to hot ones bool shouldInlineColdCallee(CallBase &CallInst); void emitOptimizationRemarksForInlineCandidates( @@ -917,6 +997,31 @@ return R; } + auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) { + assert(L && R && "Expect non-null FunctionSamples"); + if (L->getEntrySamples() != R->getEntrySamples()) + return L->getEntrySamples() > R->getEntrySamples(); + return FunctionSamples::getGUID(L->getName()) < + FunctionSamples::getGUID(R->getName()); + }; + + if (ProfileIsCS) { + auto CalleeSamples = + ContextTracker->getIndirectCalleeContextSamplesFor(DIL); + if (CalleeSamples.empty()) + return R; + + // For CSSPGO, we only use target context profile's entry count + // as that already includes both inlined callee and non-inlined ones.. 
+ Sum = 0; + for (const auto *const FS : CalleeSamples) { + Sum += FS->getEntrySamples(); + R.push_back(FS); + } + llvm::sort(R, FSCompare); + return R; + } + const FunctionSamples *FS = findFunctionSamples(Inst); if (FS == nullptr) return R; @@ -934,12 +1039,7 @@ Sum += NameFS.second.getEntrySamples(); R.push_back(&NameFS.second); } - llvm::sort(R, [](const FunctionSamples *L, const FunctionSamples *R) { - if (L->getEntrySamples() != R->getEntrySamples()) - return L->getEntrySamples() > R->getEntrySamples(); - return FunctionSamples::getGUID(L->getName()) < - FunctionSamples::getGUID(R->getName()); - }); + llvm::sort(R, FSCompare); } return R; } @@ -976,7 +1076,32 @@ return it.first->second; } -bool SampleProfileLoader::inlineCallInstruction(CallBase &CB) { +CallBase * +SampleProfileLoader::tryPromoteIndirectCall(Function &F, StringRef CalleeName, + uint64_t &Sum, uint64_t Count, + CallBase *I, const char *&Reason) { + Reason = "Callee function not available"; + // R->getValue() != &F is to prevent promoting a recursive call. + // If it is a recursive call, we do not inline it as it could bloat + // the code exponentially. There is way to better handle this, e.g. + // clone the caller first, and inline the cloned caller if it is + // recursive. As llvm does not inline recursive calls, we will + // simply ignore it instead of handling it explicitly. 
+ auto R = SymbolMap.find(CalleeName); + if (R != SymbolMap.end() && R->getValue() && + !R->getValue()->isDeclaration() && R->getValue()->getSubprogram() && + R->getValue()->hasFnAttribute("use-sample-profile") && + R->getValue() != &F && isLegalToPromote(*I, R->getValue(), &Reason)) { + auto *DI = + &pgo::promoteIndirectCall(*I, R->getValue(), Count, Sum, false, ORE); + Sum -= Count; + return DI; + } + return nullptr; +} + +bool SampleProfileLoader::inlineCallInstruction( + CallBase &CB, const FunctionSamples *CalleeSamples) { if (ExternalInlineAdvisor) { auto Advice = ExternalInlineAdvisor->getAdvice(CB); if (!Advice->isInliningRecommended()) { @@ -1011,6 +1136,9 @@ // The call to InlineFunction erases I, so we can't pass it here. emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost, true, CSINLINE_DEBUG); + if (ProfileIsCS) + ContextTracker->markContextSamplesInlined(CalleeSamples); + ++NumCSInlined; return true; } return false; @@ -1128,34 +1256,17 @@ if (!callsiteIsHot(FS, PSI)) continue; - const char *Reason = "Callee function not available"; - // R->getValue() != &F is to prevent promoting a recursive call. - // If it is a recursive call, we do not inline it as it could bloat - // the code exponentially. There is way to better handle this, e.g. - // clone the caller first, and inline the cloned caller if it is - // recursive. As llvm does not inline recursive calls, we will - // simply ignore it instead of handling it explicitly. 
+ const char *Reason = nullptr; auto CalleeFunctionName = FS->getFuncName(); - auto R = SymbolMap.find(CalleeFunctionName); - if (R != SymbolMap.end() && R->getValue() && - !R->getValue()->isDeclaration() && - R->getValue()->getSubprogram() && - R->getValue()->hasFnAttribute("use-sample-profile") && - R->getValue() != &F && - isLegalToPromote(*I, R->getValue(), &Reason)) { - uint64_t C = FS->getEntrySamples(); - auto &DI = - pgo::promoteIndirectCall(*I, R->getValue(), C, Sum, false, ORE); - Sum -= C; + if (CallBase *DI = + tryPromoteIndirectCall(F, CalleeFunctionName, Sum, + FS->getEntrySamples(), I, Reason)) { PromotedInsns.insert(I); // If profile mismatches, we should not attempt to inline DI. if ((isa(DI) || isa(DI)) && - inlineCallInstruction(cast(DI))) { - if (ProfileIsCS) - ContextTracker->markContextSamplesInlined(FS); + inlineCallInstruction(cast(*DI), FS)) { localNotInlinedCallSites.erase(I); LocalChanged = true; - ++NumCSInlined; } } else { LLVM_DEBUG(dbgs() @@ -1165,13 +1276,9 @@ } } else if (CalledFunction && CalledFunction->getSubprogram() && !CalledFunction->isDeclaration()) { - if (inlineCallInstruction(*I)) { - if (ProfileIsCS) - ContextTracker->markContextSamplesInlined( - localNotInlinedCallSites[I]); + if (inlineCallInstruction(*I, localNotInlinedCallSites[I])) { localNotInlinedCallSites.erase(I); LocalChanged = true; - ++NumCSInlined; } } else if (IsThinLTOPreLink) { findCalleeFunctionSamples(*I)->findInlinedFunctions( @@ -1231,6 +1338,250 @@ return Changed; } +bool SampleProfileLoader::tryInlineCandidate( + InlineCandidate &Candidate, SmallVector &InlinedCallSites) { + + CallBase &CB = *Candidate.CallInstr; + Function *CalledFunction = CB.getCalledFunction(); + assert(CalledFunction); + DebugLoc DLoc = CB.getDebugLoc(); + BasicBlock *BB = CB.getParent(); + + InlineCost Cost = shouldInlineCandidate(Candidate); + if (Cost.isNever()) { + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB) + << "incompatible inlining"); + 
return false; + } + + if (!Cost) + return false; + + InlineFunctionInfo IFI(nullptr, GetAC); + if (InlineFunction(CB, IFI).isSuccess()) { + // The call to InlineFunction erases I, so we can't pass it here. + emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost, + true, CSINLINE_DEBUG); + + // Now populate the list of newly exposed call sites. + InlinedCallSites.clear(); + for (auto &I : IFI.InlinedCallSites) + InlinedCallSites.push_back(I); + + if (ProfileIsCS) + ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples); + ++NumCSInlined; + return true; + } + return false; +} + +bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate, + CallBase *CB) { + assert(CB && NewCandidate && "Expect non-null call site and candidate"); + + if (isa<IntrinsicInst>(CB)) + return false; + + // Find the callee's profile. For indirect call, find hottest target profile. + const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB); + if (!CalleeSamples) + return false; + + uint64_t CallsiteCount = 0; + ErrorOr<uint64_t> Weight = getBlockWeight(CB->getParent()); + if (Weight) + CallsiteCount = Weight.get(); + // CalleeSamples is non-null here; its entry count is a lower bound. + CallsiteCount = std::max(CallsiteCount, CalleeSamples->getEntrySamples()); + + *NewCandidate = {CB, CalleeSamples, CallsiteCount}; + return true; +} + +InlineCost +SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { + assert(ProfileIsCS); + + std::unique_ptr<InlineAdvice> Advice = nullptr; + if (ExternalInlineAdvisor) { + Advice = ExternalInlineAdvisor->getAdvice(*Candidate.CallInstr); + if (!Advice->isInliningRecommended()) { + Advice->recordUnattemptedInlining(); + return InlineCost::getNever("not previously inlined"); + } + Advice->recordInlining(); + return InlineCost::getAlways("previously inlined"); + } + + // Adjust threshold based on call site hotness, only do this for callsite + // prioritized inliner because otherwise cost-benefit check is done earlier. 
+ int SampleThreshold = SampleColdCallSiteThreshold; + if (CallsitePrioritizedInline) { + if (Candidate.CallsiteCount > PSI->getHotCountThreshold()) + SampleThreshold = SampleHotCallSiteThreshold; + else if (!ProfileSizeInline) + return InlineCost::getNever("cold callsite"); + } + + Function *Callee = Candidate.CallInstr->getCalledFunction(); + assert(Callee && "Expect a definition for inline candidate of direct call"); + + InlineParams Params = getInlineParams(); + Params.ComputeFullInlineCost = true; + // Checks if there is anything in the reachable portion of the callee at + // this callsite that makes this inlining potentially illegal. Need to + // set ComputeFullInlineCost, otherwise getInlineCost may return early + // when cost exceeds threshold without checking all IRs in the callee. + // The actual cost does not matter because we only check isNever() to + // see if it is legal to inline the callsite. + InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params, + GetTTI(*Callee), GetAC, GetTLI); + + // For old FDO inliner, we inline the call site as long as cost is not + // "Never". The cost-benefit check is done earlier. + if (!CallsitePrioritizedInline) { + if (Cost.isNever()) + return Cost; + return InlineCost::getAlways("hot callsite previously inlined"); + } + + // Honor always inline and never inline from call analyzer + if (Cost.isNever() || Cost.isAlways()) + return Cost; + + // Otherwise only use the cost from call analyzer, but overwrite threshold with + // Sample PGO threshold. + return InlineCost::get(Cost.getCost(), SampleThreshold); +} + +bool SampleProfileLoader::inlineHotFunctionsWithPriority( + Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { + DenseSet<Instruction *> PromotedInsns; + assert(ProfileIsCS); + + // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure + // Profile symbol list is ignored when profile-sample-accurate is on. 
+ assert((!ProfAccForSymsInList || + (!ProfileSampleAccurate && + !F.hasFnAttribute("profile-sample-accurate"))) && + "ProfAccForSymsInList should be false when profile-sample-accurate " + "is enabled"); + + // Populating worklist with initial call sites from root inliner, along + // with call site weights. + CandidateQueue CQueue; + InlineCandidate NewCandidate; + for (auto &BB : F) { + for (auto &I : BB.getInstList()) { + auto *CB = dyn_cast<CallBase>(&I); + if (!CB) + continue; + if (getInlineCandidate(&NewCandidate, CB)) + CQueue.push(NewCandidate); + } + } + + // Cap the size growth from profile guided inlining. This is needed even + // though cost of each inline candidate already accounts for callee size, + // because with top-down inlining, we can grow inliner size significantly + // with large number of smaller inlinees each pass the cost check. + assert(ProfileInlineLimitMax >= ProfileInlineLimitMin); + unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit; + SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); + SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin); + if (ExternalInlineAdvisor) + SizeLimit = std::numeric_limits<unsigned>::max(); + + // Perform iterative BFS call site prioritized inlining + bool Changed = false; + while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) { + InlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + CallBase *I = Candidate.CallInstr; + Function *CalledFunction = I->getCalledFunction(); + + if (CalledFunction == &F) + continue; + if (I->isIndirectCall()) { + if (PromotedInsns.count(I)) + continue; + uint64_t Sum = 0; + auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum); + uint64_t ICPCutoff = Sum / std::max<int>(1, ProfileICPThreshold); + for (const auto *FS : CalleeSamples) { + // TODO: Consider disable pre-lTO ICP for MonoLTO as well + if (IsThinLTOPreLink) { + FS->findInlinedFunctions(InlinedGUIDs, F.getParent(), + PSI->getOrCompHotCountThreshold()); + continue; + } + uint64_t EntryCountDistributed = 
FS->getEntrySamples(); + // In addition to regular inline cost check, we also need to make sure + // ICP isn't introducing excessive speculative checks even if individual + // target looks beneficial to promote and inline. That means we should + // only do ICP when there's a small number dominant targets. + if (EntryCountDistributed < ICPCutoff) + break; + // For indirect call, we don't run CallAnalyzer through InlineCost + // before actual inlining to work around PR18962. However, that means we + // may do ICP first and later decided not to inline, which is mostly ok + // for perf. + if (!PSI->isHotCount(EntryCountDistributed)) + break; + const char *Reason = nullptr; + auto CalleeFunctionName = FS->getFuncName(); + if (CallBase *DI = tryPromoteIndirectCall( + F, CalleeFunctionName, Sum, EntryCountDistributed, I, Reason)) { + // Attach function profile for selected indirect callee, and update + // call site count for the selected target too. Speculatively check + // if it's beneficial to inline the callee to decide whether to ICP. + Candidate = {DI, FS, EntryCountDistributed}; + PromotedInsns.insert(I); + SmallVector<CallBase *, 8> InlinedCallSites; + // If profile mismatches, we should not attempt to inline DI. 
+ if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) && + tryInlineCandidate(Candidate, InlinedCallSites)) { + for (auto *CB : InlinedCallSites) { + if (getInlineCandidate(&NewCandidate, CB)) + CQueue.emplace(NewCandidate); + } + Changed = true; + } + } else { + LLVM_DEBUG(dbgs() + << "\nFailed to promote indirect call to " + << CalleeFunctionName << " because " << Reason << "\n"); + } + } + } else if (CalledFunction && CalledFunction->getSubprogram() && + !CalledFunction->isDeclaration()) { + SmallVector<CallBase *, 8> InlinedCallSites; + if (tryInlineCandidate(Candidate, InlinedCallSites)) { + for (auto *CB : InlinedCallSites) { + if (getInlineCandidate(&NewCandidate, CB)) + CQueue.emplace(NewCandidate); + } + Changed = true; + } + } else if (IsThinLTOPreLink) { + findCalleeFunctionSamples(*I)->findInlinedFunctions( + InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold()); + } + } + + if (!CQueue.empty()) { + if (SizeLimit == (unsigned)ProfileInlineLimitMax) + ++NumCSInlinedHitMaxLimit; + else if (SizeLimit == (unsigned)ProfileInlineLimitMin) + ++NumCSInlinedHitMinLimit; + else + ++NumCSInlinedHitGrowthLimit; + } + + return Changed; +} + /// Find equivalence classes for the given block. /// /// This finds all the blocks that are guaranteed to execute the same @@ -1832,7 +2183,10 @@ } DenseSet<GlobalValue::GUID> InlinedGUIDs; - Changed |= inlineHotFunctions(F, InlinedGUIDs); + if (ProfileIsCS && CallsitePrioritizedInline) + Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs); + else + Changed |= inlineHotFunctions(F, InlinedGUIDs); // Compute basic block weights. Changed |= computeBlockWeights(F); @@ -1967,6 +2321,9 @@ // Apply tweaks if context-sensitive profile is available. if (Reader->profileIsCS()) { + // Default to size inline on with CSSPGO. 
+ if (!ProfileSizeInline.getNumOccurrences()) + ProfileSizeInline = true; ProfileIsCS = true; FunctionSamples::ProfileIsCS = true; diff --git a/llvm/test/Transforms/SampleProfile/Inputs/indirect-call-csspgo.prof b/llvm/test/Transforms/SampleProfile/Inputs/indirect-call-csspgo.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/indirect-call-csspgo.prof @@ -0,0 +1,10 @@ +[test]:63067:0 + 1: 3345 _Z3barv:1398 _Z3foov:2059 + 2: 100 _Z3bazv:102 + 3: 100 _Z3zoov:102 +[test:1 @ _Z3barv]:200:100 + 1: 100 +[test:1 @ _Z3foov]:4220:1200 + 14: 4220 +[test:2 @ _Z3bazv]:200:100 + 5: 100 \ No newline at end of file diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll copy from llvm/test/Transforms/SampleProfile/profile-context-tracker.ll copy to llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll --- a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll +++ b/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll @@ -1,25 +1,28 @@ -; Test for CSSPGO's SampleContextTracker to make sure context profile tree is promoted and merged properly -; based on inline decision, so post inline counts are accurate. +; REQUIRES: asserts +; Test that the new FDO inliner using priority queue will not visit same call site again and again. +; Use debug prints as repeated call site evaluation is not visible from final inline decision. 
; Note that we need new pass manager to enable top-down processing for sample profile loader -; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile -; main:3 @ _Z5funcAi -; main:3 @ _Z5funcAi:1 @ _Z8funcLeafi -; _Z5funcBi:1 @ _Z8funcLeafi -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL - -; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile -; main:3 @ _Z5funcAi -; _Z5funcAi:1 @ _Z8funcLeafi -; _Z5funcBi:1 @ _Z8funcLeafi -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT - +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=OLD-INLINE +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=1 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=NEW-INLINE + +; Old inliner will evaluate the same call site three times +; OLD-INLINE: Getting callee context for instr: %call = tail call i32 @_Z5funcBi +; OLD-INLINE-NEXT: Callee context found: main:3.1 @ _Z5funcBi +; OLD-INLINE: Getting callee context for instr: %call = tail call i32 @_Z5funcBi +; OLD-INLINE-NEXT: Callee context found: main:3.1 @ _Z5funcBi +; OLD-INLINE: Getting callee context for instr: %call = tail call i32 @_Z5funcBi +; OLD-INLINE-NEXT: Callee context found: main:3.1 @ _Z5funcBi + +; New inliner only evaluate the same call site once +; NEW-INLINE: Getting callee context for instr: %call = tail call 
i32 @_Z5funcBi +; NEW-INLINE-NEXT: Callee context found: main:3.1 @ _Z5funcBi +; NEW-INLINE-NOT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi +; NEW-INLINE-NOT: Callee context found: main:3.1 @ _Z5funcBi @factor = dso_local global i32 3, align 4, !dbg !0 define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 { -; INLINE-ALL: @main{{.*}}!prof ![[MAIN_PROF:[0-9]+]] -; INLINE-HOT: @main{{.*}}!prof ![[MAIN_PROF:[0-9]+]] entry: br label %for.body, !dbg !25 @@ -30,13 +33,8 @@ %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ] %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ] %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32 -; _Z5funcBi is marked noinline -; INLINE-ALL: call i32 @_Z5funcBi -; INLINE-HOT: call i32 @_Z5funcBi %add = add nuw nsw i32 %x.011, 1, !dbg !31 %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28 -; INLINE-ALL-NOT: call i32 @_Z5funcAi -; INLINE-HOT: call i32 @_Z5funcAi %add2 = add i32 %call, %r.010, !dbg !34 %add3 = add i32 %add2, %call1, !dbg !35 %dec = add nsw i32 %x.011, -1, !dbg !36 @@ -45,25 +43,13 @@ } define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #1 !dbg !40 { -; _Z5funcAi is inlined, so outline remainder should have zero counts -; INLINE-ALL: @_Z5funcAi{{.*}}!prof ![[FUNCA_PROF:[0-9]+]] -; INLINE-HOT: @_Z5funcAi{{.*}}!prof ![[FUNCA_PROF:[0-9]+]] entry: %add = add nsw i32 %x, 100000, !dbg !44 -; _Z8funcLeafi is already inlined on main->_Z5funcAi->_Z8funcLeafi, -; so it should not be inlined on _Z5funcAi->_Z8funcLeafi based on updated -; (merged and promoted) context profile -; INLINE-ALL: call i32 @_Z8funcLeafi -; INLINE-HOT-NOT: call i32 @_Z8funcLeafi %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !45 ret i32 %call, !dbg !46 } define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 { -; main->_Z5funcAi->_Z8funcLeafi is inlined, and _Z5funcBi->_Z8funcLeafi is also -; inlined, so outline remainder should have empty profile -; INLINE-ALL: @_Z8funcLeafi{{.*}}!prof 
![[LEAF_PROF:[0-9]+]] -; INLINE-HOT: @_Z8funcLeafi{{.*}}!prof ![[LEAF_PROF:[0-9]+]] entry: %cmp = icmp sgt i32 %x, 0, !dbg !57 br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59 @@ -94,29 +80,12 @@ } define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 { -; _Z5funcBi is marked noinline, so outline remainder has promoted context profile -; INLINE-ALL: @_Z5funcBi{{.*}}!prof ![[FUNCB_PROF:[0-9]+]] -; INLINE-HOT: @_Z5funcBi{{.*}}!prof ![[FUNCB_PROF:[0-9]+]] entry: %sub = add nsw i32 %x, -100000, !dbg !51 %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52 -; _Z5funcBi is not inlined into main, so we main->_Z5funcBi->_Z8funcLeafi -; should be inlined based on promoted context profile -; INLINE-ALL-NOT: call i32 @_Z8funcLeafi -; INLINE-HOT-NOT: call i32 @_Z8funcLeafi ret i32 %call, !dbg !53 } -; INLINE-ALL-DAG: [[MAIN_PROF]] = !{!"function_entry_count", i64 13} -; INLINE-ALL-DAG: [[FUNCA_PROF]] = !{!"function_entry_count", i64 0} -; INLINE-ALL-DAG-SAME: [[LEAF_PROF]] = !{!"function_entry_count", i64 0} -; INLINE-ALL-DAG: [[FUNCB_PROF]] = !{!"function_entry_count", i64 33} - -; INLINE-HOT-DAG: [[MAIN_PROF]] = !{!"function_entry_count", i64 13} -; INLINE-HOT-DAG: [[FUNCA_PROF]] = !{!"function_entry_count", i64 12} -; INLINE-HOT-DAG-SAME: [[LEAF_PROF]] = !{!"function_entry_count", i64 0} -; INLINE-HOT-DAG: [[FUNCB_PROF]] = !{!"function_entry_count", i64 33} - declare i32 @_Z3fibi(i32) attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" } diff --git 
a/llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll @@ -0,0 +1,63 @@ +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-ALL %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-ALL %s +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -sample-profile-inline-size=0 -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-HOT %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -sample-profile-inline-size=0 -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-HOT %s + +define void @test(void ()*) #0 !dbg !3 { +;; Add direct calls to force top-down order for the sample profile loader + call void @_Z3foov(), !dbg !7 + call void @_Z3barv(), !dbg !7 + call void @_Z3bazv(), !dbg !7 + %2 = alloca void ()* + store void ()* %0, void ()** %2 + %3 = load void ()*, void ()** %2 + call void %3(), !dbg !4 + %4 = alloca void ()* + store void ()* %0, void ()** %4 + %5 = load void ()*, void ()** %4 + call void %5(), !dbg !5 + ret void +} + +define void @_Z3foov() #0 !dbg !8 { + ret void +} + +define void @_Z3barv() #0 !dbg !9 { + ret void +} + +define void @_Z3bazv() #0 !dbg !10 { + ret void +} + +define void @_Z3zoov() #0 !dbg !11 { + ret void +} + +attributes #0 = {"use-sample-profile"} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1)
+!1 = !DIFile(filename: "test.cc", directory: "/") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 3, unit: !0) +!4 = !DILocation(line: 4, scope: !3) +!5 = !DILocation(line: 5, scope: !3) +!6 = !DILocation(line: 6, scope: !3) +!7 = !DILocation(line: 7, scope: !3) +!8 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 29, unit: !0) +!9 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 32, unit: !0) +!10 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 24, unit: !0) +!11 = distinct !DISubprogram(name: "zoo", linkageName: "_Z3zoov", scope: !1, file: !1, line: 24, unit: !0) + + +; ICP-ALL: remark: test.cc:5:0: _Z3bazv inlined into test +; ICP-ALL-NEXT: remark: test.cc:4:0: _Z3foov inlined into test +; ICP-ALL-NEXT: remark: test.cc:4:0: _Z3barv inlined into test +; ICP-ALL-NOT: remark + +; ICP-HOT: remark: test.cc:4:0: _Z3foov inlined into test +; ICP-HOT-NOT: remark diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll copy from llvm/test/Transforms/SampleProfile/profile-context-tracker.ll copy to llvm/test/Transforms/SampleProfile/csspgo-inline.ll --- a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll +++ b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll @@ -1,25 +1,42 @@ -; Test for CSSPGO's SampleContextTracker to make sure context profile tree is promoted and merged properly -; based on inline decision, so post inline counts are accurate. 
+; Test for CSSPGO's new early inliner using priority queue ; Note that we need new pass manager to enable top-down processing for sample profile loader -; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile +; Test we inlined the following in top-down order with old inliner ; main:3 @ _Z5funcAi ; main:3 @ _Z5funcAi:1 @ _Z8funcLeafi ; _Z5funcBi:1 @ _Z8funcLeafi -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL - -; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile -; main:3 @ _Z5funcAi -; _Z5funcAi:1 @ _Z8funcLeafi -; _Z5funcBi:1 @ _Z8funcLeafi -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT - +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE +; +; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, so we get less inlining for given profile +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-NEW +; +; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, tuning hot cutoff can get us the same inlining +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -profile-sample-accurate 
-S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE +; +; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, tuning cold sample profile inline threshold can get us the same inlining +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE +; +; With new FDO early inliner and tuned cutoff, we can control inlining through size growth tuning knob. +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -sample-profile-inline-limit-min=0 -sample-profile-inline-growth-limit=1 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --allow-empty --check-prefix=INLINE-NEW-LIMIT1 +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -sample-profile-inline-limit-min=10 -sample-profile-inline-growth-limit=1 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-NEW-LIMIT2 + + +; INLINE-BASE: remark: merged.cpp:14:10: _Z5funcAi inlined into main to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite main:3 +; INLINE-BASE: remark: merged.cpp:27:11: _Z8funcLeafi inlined into main to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcAi:1 @ main:3 +; INLINE-BASE: remark: merged.cpp:33:11: _Z8funcLeafi inlined into _Z5funcBi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcBi:1 + +; INLINE-NEW: remark: merged.cpp:14:10: _Z5funcAi inlined into main to match profiling context with 
(cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite main:3 +; INLINE-NEW-NOT: remark + +; INLINE-NEW-LIMIT1-NOT: remark + +; INLINE-NEW-LIMIT2: remark: merged.cpp:27:11: _Z8funcLeafi inlined into _Z5funcAi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcAi:1 +; INLINE-NEW-LIMIT2: remark: merged.cpp:33:11: _Z8funcLeafi inlined into _Z5funcBi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcBi:1 +; INLINE-NEW-LIMIT2-NOT: remark @factor = dso_local global i32 3, align 4, !dbg !0 define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 { -; INLINE-ALL: @main{{.*}}!prof ![[MAIN_PROF:[0-9]+]] -; INLINE-HOT: @main{{.*}}!prof ![[MAIN_PROF:[0-9]+]] entry: br label %for.body, !dbg !25 @@ -30,13 +47,8 @@ %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ] %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ] %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32 -; _Z5funcBi is marked noinline -; INLINE-ALL: call i32 @_Z5funcBi -; INLINE-HOT: call i32 @_Z5funcBi %add = add nuw nsw i32 %x.011, 1, !dbg !31 %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28 -; INLINE-ALL-NOT: call i32 @_Z5funcAi -; INLINE-HOT: call i32 @_Z5funcAi %add2 = add i32 %call, %r.010, !dbg !34 %add3 = add i32 %add2, %call1, !dbg !35 %dec = add nsw i32 %x.011, -1, !dbg !36 @@ -45,25 +57,13 @@ } define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #1 !dbg !40 { -; _Z5funcAi is inlined, so outline remainder should have zero counts -; INLINE-ALL: @_Z5funcAi{{.*}}!prof ![[FUNCA_PROF:[0-9]+]] -; INLINE-HOT: @_Z5funcAi{{.*}}!prof ![[FUNCA_PROF:[0-9]+]] entry: %add = add nsw i32 %x, 100000, !dbg !44 -; _Z8funcLeafi is already inlined on main->_Z5funcAi->_Z8funcLeafi, -; so it should not be inlined on _Z5funcAi->_Z8funcLeafi based on updated -; (merged and promoted) context profile -; INLINE-ALL: call i32 @_Z8funcLeafi -; INLINE-HOT-NOT: call i32 @_Z8funcLeafi %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg 
!45 ret i32 %call, !dbg !46 } define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 { -; main->_Z5funcAi->_Z8funcLeafi is inlined, and _Z5funcBi->_Z8funcLeafi is also -; inlined, so outline remainder should have empty profile -; INLINE-ALL: @_Z8funcLeafi{{.*}}!prof ![[LEAF_PROF:[0-9]+]] -; INLINE-HOT: @_Z8funcLeafi{{.*}}!prof ![[LEAF_PROF:[0-9]+]] entry: %cmp = icmp sgt i32 %x, 0, !dbg !57 br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59 @@ -94,29 +94,12 @@ } define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 { -; _Z5funcBi is marked noinline, so outline remainder has promoted context profile -; INLINE-ALL: @_Z5funcBi{{.*}}!prof ![[FUNCB_PROF:[0-9]+]] -; INLINE-HOT: @_Z5funcBi{{.*}}!prof ![[FUNCB_PROF:[0-9]+]] entry: %sub = add nsw i32 %x, -100000, !dbg !51 %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52 -; _Z5funcBi is not inlined into main, so we main->_Z5funcBi->_Z8funcLeafi -; should be inlined based on promoted context profile -; INLINE-ALL-NOT: call i32 @_Z8funcLeafi -; INLINE-HOT-NOT: call i32 @_Z8funcLeafi ret i32 %call, !dbg !53 } -; INLINE-ALL-DAG: [[MAIN_PROF]] = !{!"function_entry_count", i64 13} -; INLINE-ALL-DAG: [[FUNCA_PROF]] = !{!"function_entry_count", i64 0} -; INLINE-ALL-DAG-SAME: [[LEAF_PROF]] = !{!"function_entry_count", i64 0} -; INLINE-ALL-DAG: [[FUNCB_PROF]] = !{!"function_entry_count", i64 33} - -; INLINE-HOT-DAG: [[MAIN_PROF]] = !{!"function_entry_count", i64 13} -; INLINE-HOT-DAG: [[FUNCA_PROF]] = !{!"function_entry_count", i64 12} -; INLINE-HOT-DAG-SAME: [[LEAF_PROF]] = !{!"function_entry_count", i64 0} -; INLINE-HOT-DAG: [[FUNCB_PROF]] = !{!"function_entry_count", i64 33} - declare i32 @_Z3fibi(i32) attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" 
"no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" } diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll --- a/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll +++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll @@ -3,11 +3,11 @@ ; based on inline decision, so post inline counts are accurate. ; Note that we need new pass manager to enable top-down processing for sample profile loader -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-ALL -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-HOT +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-ALL +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-prioritized-inline=0 -sample-profile-inline-size=0 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-HOT -; Testwe we inlined the following in top-down order and promot rest not inlined context profile into base profile +; Test we inlined the following in top-down order and promot rest not inlined context profile into base profile ; main:3 @ _Z5funcAi ; main:3 @ _Z5funcAi:1 @ _Z8funcLeafi ; _Z5funcBi:1 @ _Z8funcLeafi 
@@ -20,13 +20,9 @@ ; INLINE-ALL-NEXT: Getting callee context for instr: %call1 = tail call i32 @_Z5funcAi ; INLINE-ALL-NEXT: Callee context found: main:3 @ _Z5funcAi ; INLINE-ALL-NEXT: Marking context profile as inlined: main:3 @ _Z5funcAi -; INLINE-ALL-NEXT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi( -; INLINE-ALL-NEXT: Callee context found: main:3.1 @ _Z5funcBi ; INLINE-ALL-NEXT: Getting callee context for instr: %call.i = tail call i32 @_Z8funcLeafi ; INLINE-ALL-NEXT: Callee context found: main:3 @ _Z5funcAi:1 @ _Z8funcLeafi ; INLINE-ALL-NEXT: Marking context profile as inlined: main:3 @ _Z5funcAi:1 @ _Z8funcLeafi -; INLINE-ALL-NEXT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi -; INLINE-ALL-NEXT: Callee context found: main:3.1 @ _Z5funcBi ; INLINE-ALL-NEXT: Getting callee context for instr: %call.i1 = tail call i32 @_Z3fibi ; INLINE-ALL-NEXT: Getting callee context for instr: %call5.i = tail call i32 @_Z3fibi ; INLINE-ALL-NEXT: Getting base profile for function: _Z5funcAi @@ -48,24 +44,23 @@ ; INLINE-ALL-NEXT: Getting base profile for function: _Z8funcLeafi ; INLINE-ALL-NEXT: Merging context profile into base profile: _Z8funcLeafi -; Testwe we inlined the following in top-down order and promot rest not inlined context profile into base profile -; main:3 @ _Z5funcAi +; Test that we inlined the following in top-down order and promoted the rest of the not-inlined context profiles into the base profile ; _Z5funcAi:1 @ _Z8funcLeafi ; _Z5funcBi:1 @ _Z8funcLeafi ; INLINE-HOT: Getting base profile for function: main ; INLINE-HOT-NEXT: Merging context profile into base profile: main ; INLINE-HOT-NEXT: Found context tree root to promote: external:12 @ main ; INLINE-HOT-NEXT: Context promoted and merged to: main -; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !58 +; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi ; INLINE-HOT-NEXT: Callee context found:
main:3.1 @ _Z5funcBi -; INLINE-HOT-NEXT: Getting callee context for instr: %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !63 +; INLINE-HOT-NEXT: Getting callee context for instr: %call1 = tail call i32 @_Z5funcAi ; INLINE-HOT-NEXT: Callee context found: main:3 @ _Z5funcAi ; INLINE-HOT-NEXT: Getting base profile for function: _Z5funcAi ; INLINE-HOT-NEXT: Merging context profile into base profile: _Z5funcAi ; INLINE-HOT-NEXT: Found context tree root to promote: main:3 @ _Z5funcAi ; INLINE-HOT-NEXT: Context promoted to: _Z5funcAi ; INLINE-HOT-NEXT: Context promoted to: _Z5funcAi:1 @ _Z8funcLeafi -; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !50 +; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !50 ; INLINE-HOT-NEXT: Callee context found: _Z5funcAi:1 @ _Z8funcLeafi ; INLINE-HOT-NEXT: Marking context profile as inlined: _Z5funcAi:1 @ _Z8funcLeafi ; INLINE-HOT-NEXT: Getting callee context for instr: %call.i = tail call i32 @_Z3fibi(i32 %tmp.i) #2, !dbg !62 @@ -79,11 +74,11 @@ ; INLINE-HOT-NEXT: Context promoted to: _Z5funcBi:1 @ _Z8funcLeafi ; INLINE-HOT-NEXT: Found context tree root to promote: externalA:17 @ _Z5funcBi ; INLINE-HOT-NEXT: Context promoted and merged to: _Z5funcBi -; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !50 +; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z8funcLeafi ; INLINE-HOT-NEXT: Callee context found: _Z5funcBi:1 @ _Z8funcLeafi ; INLINE-HOT-NEXT: Marking context profile as inlined: _Z5funcBi:1 @ _Z8funcLeafi -; INLINE-HOT-NEXT: Getting callee context for instr: %call.i = tail call i32 @_Z3fibi(i32 %tmp.i) #2, !dbg !62 -; INLINE-HOT-NEXT: Getting callee context for instr: %call5.i = tail call i32 @_Z3fibi(i32 %tmp1.i) #2, !dbg !69 +; INLINE-HOT-NEXT: Getting callee context for instr: %call.i = tail call i32 @_Z3fibi +; INLINE-HOT-NEXT: 
Getting callee context for instr: %call5.i = tail call i32 @_Z3fibi ; INLINE-HOT-NEXT: Getting base profile for function: _Z8funcLeafi ; INLINE-HOT-NEXT: Merging context profile into base profile: _Z8funcLeafi diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll --- a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll +++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll @@ -6,13 +6,14 @@ ; main:3 @ _Z5funcAi ; main:3 @ _Z5funcAi:1 @ _Z8funcLeafi ; _Z5funcBi:1 @ _Z8funcLeafi -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL +; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, so we need to tune down cold threshold to get the same inlining. 
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL -; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile -; main:3 @ _Z5funcAi +; Test that we inlined the following in top-down order and that entry counts accurately reflect the post-inline base profile ; _Z5funcAi:1 @ _Z8funcLeafi ; _Z5funcBi:1 @ _Z8funcLeafi -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size=0 -sample-profile-prioritized-inline=0 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT @factor = dso_local global i32 3, align 4, !dbg !0