diff --git a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h --- a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h +++ b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h @@ -23,6 +23,7 @@ #include "llvm/ProfileData/SampleProf.h" #include #include +#include using namespace llvm; using namespace sampleprof; @@ -42,7 +43,7 @@ CallSiteLoc(CallLoc){}; ContextTrieNode *getChildContext(const LineLocation &CallSite, StringRef CalleeName); - ContextTrieNode *getChildContext(const LineLocation &CallSite); + ContextTrieNode *getHottestChildContext(const LineLocation &CallSite); ContextTrieNode *getOrCreateChildContext(const LineLocation &CallSite, StringRef CalleeName, bool AllowCreate = true); @@ -94,6 +95,9 @@ // call-site. The full context is identified by location of call instruction. FunctionSamples *getCalleeContextSamplesFor(const CallBase &Inst, StringRef CalleeName); + // Get samples for indirect call targets for call site at given location. + std::vector + getIndirectCalleeContextSamplesFor(const DILocation *DIL); // Query context profile for a given location. The full context // is identified by input DILocation. 
FunctionSamples *getContextSamplesFor(const DILocation *DIL); diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp --- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp +++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp @@ -30,7 +30,7 @@ ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite, StringRef CalleeName) { if (CalleeName.empty()) - return getChildContext(CallSite); + return getHottestChildContext(CallSite); uint32_t Hash = nodeHash(CalleeName, CallSite); auto It = AllChildContext.find(Hash); @@ -40,18 +40,22 @@ } ContextTrieNode * -ContextTrieNode::getChildContext(const LineLocation &CallSite) { +ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) { // CSFDO-TODO: This could be slow, change AllChildContext so we can // do point look up for child node by call site alone. - // CSFDO-TODO: Return the child with max count for indirect call + // Retrieve the child node with max count for indirect call ContextTrieNode *ChildNodeRet = nullptr; + uint64_t MaxCalleeSamples = 0; for (auto &It : AllChildContext) { ContextTrieNode &ChildNode = It.second; - if (ChildNode.CallSiteLoc == CallSite) { - if (ChildNodeRet) - return nullptr; - else - ChildNodeRet = &ChildNode; + if (ChildNode.CallSiteLoc != CallSite) + continue; + FunctionSamples *Samples = ChildNode.getFunctionSamples(); + if (!Samples) + continue; + if (Samples->getTotalSamples() > MaxCalleeSamples) { + ChildNodeRet = &ChildNode; + MaxCalleeSamples = Samples->getTotalSamples(); } } @@ -191,12 +195,12 @@ SampleContextTracker::getCalleeContextSamplesFor(const CallBase &Inst, StringRef CalleeName) { LLVM_DEBUG(dbgs() << "Getting callee context for instr: " << Inst << "\n"); - // CSFDO-TODO: We use CalleeName to differentiate indirect call - // We need to get sample for indirect callee too. 
DILocation *DIL = Inst.getDebugLoc(); if (!DIL) return nullptr; + // For indirect call, CalleeName will be empty, in which case the context + // profile for callee with largest total samples will be returned. ContextTrieNode *CalleeContext = getCalleeContextFor(DIL, CalleeName); if (CalleeContext) { FunctionSamples *FSamples = CalleeContext->getFunctionSamples(); @@ -209,6 +213,26 @@ return nullptr; } +std::vector +SampleContextTracker::getIndirectCalleeContextSamplesFor( + const DILocation *DIL) { + std::vector R; + if (!DIL) + return R; + + ContextTrieNode *CallerNode = getContextFor(DIL); + LineLocation CallSite = FunctionSamples::getCallSiteIdentifier(DIL); + for (auto &It : CallerNode->getAllChildContext()) { + ContextTrieNode &ChildNode = It.second; + if (ChildNode.getCallSiteLoc() != CallSite) + continue; + if (FunctionSamples *CalleeSamples = ChildNode.getFunctionSamples()) + R.push_back(CalleeSamples); + } + + return R; +} + FunctionSamples * SampleContextTracker::getContextSamplesFor(const DILocation *DIL) { assert(DIL && "Expect non-null location"); @@ -295,11 +319,6 @@ const Instruction &Inst, StringRef CalleeName) { LLVM_DEBUG(dbgs() << "Promoting and merging context tree for instr: \n" << Inst << "\n"); - // CSFDO-TODO: We also need to promote context profile from indirect - // calls. We won't have callee names from those from call instr. - if (CalleeName.empty()) - return; - // Get the caller context for the call instruction, we don't use callee // name from call because there can be context from indirect calls too. DILocation *DIL = Inst.getDebugLoc(); @@ -309,6 +328,22 @@ // Get the context that needs to be promoted LineLocation CallSite = FunctionSamples::getCallSiteIdentifier(DIL); + // For indirect call, CalleeName will be empty, in which case we need to + // promote all non-inlined child context profiles. 
+ if (CalleeName.empty()) { + for (auto &It : CallerNode->getAllChildContext()) { + ContextTrieNode *NodeToPromo = &It.second; + if (CallSite != NodeToPromo->getCallSiteLoc()) + continue; + FunctionSamples *FromSamples = NodeToPromo->getFunctionSamples(); + if (FromSamples && FromSamples->getContext().hasState(InlinedContext)) + continue; + promoteMergeContextSamplesTree(*NodeToPromo); + } + return; + } + + // Get the context for the given callee that needs to be promoted ContextTrieNode *NodeToPromo = CallerNode->getChildContext(CallSite, CalleeName); if (!NodeToPromo) @@ -328,6 +363,8 @@ LLVM_DEBUG(dbgs() << " Found context tree root to promote: " << FromSamples->getContext() << "\n"); + assert(!FromSamples->getContext().hasState(InlinedContext) && + "Shouldn't promote inlined context profile"); StringRef ContextStrToRemove = FromSamples->getContext().getCallingContext(); return promoteMergeContextSamplesTree(NodeToPromo, RootContext, ContextStrToRemove); @@ -360,14 +397,12 @@ StringRef CalleeName) { assert(DIL && "Expect non-null location"); - // CSSPGO-TODO: need to support indirect callee - if (CalleeName.empty()) - return nullptr; - ContextTrieNode *CallContext = getContextFor(DIL); if (!CallContext) return nullptr; + // When CalleeName is empty, the child context profile with max + // total samples will be returned. 
return CallContext->getChildContext( FunctionSamples::getCallSiteIdentifier(DIL), CalleeName); } diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" +#include "llvm/ADT/PriorityQueue.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" @@ -108,6 +109,14 @@ "Number of functions with CFG mismatched profile"); STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile"); +STATISTIC(NumCSInlinedHitMinLimit, + "Number of functions with FDO inline stopped due to min size limit"); +STATISTIC(NumCSInlinedHitMaxLimit, + "Number of functions with FDO inline stopped due to max size limit"); +STATISTIC( + NumCSInlinedHitGrowthLimit, + "Number of functions with FDO inline stopped due to growth size limit"); + // Command line option to specify the file to read samples from. This is // mainly used for debugging. 
static cl::opt SampleProfileFile( @@ -171,6 +180,38 @@ cl::desc("Inline cold call sites in profile loader if it's beneficial " "for code size.")); +static cl::opt ProfileInlineGrowthLimit( + "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12), + cl::desc("The size growth ratio limit for proirity-based sample profile " + "loader inlining.")); + +static cl::opt ProfileInlineLimitMin( + "sample-profile-inline-limit-min", cl::Hidden, cl::init(100), + cl::desc("The lower bound of size growth limit for " + "proirity-based sample profile loader inlining.")); + +static cl::opt ProfileInlineLimitMax( + "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000), + cl::desc("The upper bound of size growth limit for " + "proirity-based sample profile loader inlining.")); + +static cl::opt ProfileICPThreshold( + "sample-profile-icp-threshold", cl::Hidden, cl::init(5), + cl::desc( + "Relative hotness threshold for indirect " + "call promotion in proirity-based sample profile loader inlining.")); + +static cl::opt SampleHotCallSiteThreshold( + "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000), + cl::desc("Hot callsite threshold for proirity-based sample profile loader " + "inlining.")); + +static cl::opt CallsitePrioritizedInline( + "sample-profile-prioritized-inline", cl::Hidden, cl::ZeroOrMore, + cl::init(false), + cl::desc("Use call site prioritized inlining for sample profile loader." 
+ "Currently only CSSPGO is supported.")); + static cl::opt SampleColdCallSiteThreshold( "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45), cl::desc("Threshold for inlining cold callsites")); @@ -313,6 +354,31 @@ DenseMap &CurrentGUIDToFuncNameMap; }; +// Inline candidate used by iterative callsite prioritized inliner +struct InlineCandidate { + CallBase *CallInstr; + const FunctionSamples *CalleeSamples; + uint64_t CallsiteCount; +}; + +// Inline candidate comparer using call site weight +struct CandidateComparer { + bool operator()(const InlineCandidate &LHS, const InlineCandidate &RHS) { + if (LHS.CallsiteCount != RHS.CallsiteCount) + return LHS.CallsiteCount < RHS.CallsiteCount; + + // Tie breaker using GUID so we have stable/deterministic inlining order + assert(LHS.CalleeSamples && RHS.CalleeSamples && + "Expect non-null FunctionSamples"); + return LHS.CalleeSamples->getGUID(LHS.CalleeSamples->getName()) < + RHS.CalleeSamples->getGUID(RHS.CalleeSamples->getName()); + } +}; + +using CandidateQueue = + PriorityQueue, + CandidateComparer>; + /// Sample profile pass. /// /// This pass reads profile data from the file specified by @@ -350,9 +416,23 @@ findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const; mutable DenseMap DILocation2SampleMap; const FunctionSamples *findFunctionSamples(const Instruction &I) const; - bool inlineCallInstruction(CallBase &CB); + CallBase *tryPromoteIndirectCall(Function &F, StringRef CalleeName, + uint64_t &Sum, uint64_t Count, CallBase *I, + const char *&Reason); + bool inlineCallInstruction(CallBase &CB, + const FunctionSamples *CalleeSamples); bool inlineHotFunctions(Function &F, DenseSet &InlinedGUIDs); + // Helper functions call-site prioritized BFS inliner + // Will change the main FDO inliner to be work list based directly in + // upstream, then merge this change with that and remove the duplication. 
+ InlineCost shouldInlineCandidate(InlineCandidate &Candidate); + bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB); + bool tryInlineCandidate(InlineCandidate &Candidate, + SmallVector &InlinedCallSites); + bool + inlineHotFunctionsWithPriority(Function &F, + DenseSet &InlinedGUIDs); // Inline cold/small functions in addition to hot ones bool shouldInlineColdCallee(CallBase &CallInst); void emitOptimizationRemarksForInlineCandidates( @@ -918,6 +998,31 @@ return R; } + auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) { + assert(L && R && "Expect non-null FunctionSamples"); + if (L->getEntrySamples() != R->getEntrySamples()) + return L->getEntrySamples() > R->getEntrySamples(); + return FunctionSamples::getGUID(L->getName()) < + FunctionSamples::getGUID(R->getName()); + }; + + if (ProfileIsCS) { + auto CalleeSamples = + ContextTracker->getIndirectCalleeContextSamplesFor(DIL); + if (CalleeSamples.empty()) + return R; + + // For CSSPGO, we only use target context profile's entry count + // as that already includes both inlined callee and non-inlined ones. 
+ Sum = 0; + for (const auto *const FS : CalleeSamples) { + Sum += FS->getEntrySamples(); + R.push_back(FS); + } + llvm::sort(R, FSCompare); + return R; + } + const FunctionSamples *FS = findFunctionSamples(Inst); if (FS == nullptr) return R; @@ -935,12 +1040,7 @@ Sum += NameFS.second.getEntrySamples(); R.push_back(&NameFS.second); } - llvm::sort(R, [](const FunctionSamples *L, const FunctionSamples *R) { - if (L->getEntrySamples() != R->getEntrySamples()) - return L->getEntrySamples() > R->getEntrySamples(); - return FunctionSamples::getGUID(L->getName()) < - FunctionSamples::getGUID(R->getName()); - }); + llvm::sort(R, FSCompare); } return R; } @@ -977,7 +1077,32 @@ return it.first->second; } -bool SampleProfileLoader::inlineCallInstruction(CallBase &CB) { +CallBase * +SampleProfileLoader::tryPromoteIndirectCall(Function &F, StringRef CalleeName, + uint64_t &Sum, uint64_t Count, + CallBase *I, const char *&Reason) { + Reason = "Callee function not available"; + // R->getValue() != &F is to prevent promoting a recursive call. + // If it is a recursive call, we do not inline it as it could bloat + // the code exponentially. There is way to better handle this, e.g. + // clone the caller first, and inline the cloned caller if it is + // recursive. As llvm does not inline recursive calls, we will + // simply ignore it instead of handling it explicitly. 
+ auto R = SymbolMap.find(CalleeName); + if (R != SymbolMap.end() && R->getValue() && + !R->getValue()->isDeclaration() && R->getValue()->getSubprogram() && + R->getValue()->hasFnAttribute("use-sample-profile") && + R->getValue() != &F && isLegalToPromote(*I, R->getValue(), &Reason)) { + auto *DI = + &pgo::promoteIndirectCall(*I, R->getValue(), Count, Sum, false, ORE); + Sum -= Count; + return DI; + } + return nullptr; +} + +bool SampleProfileLoader::inlineCallInstruction( + CallBase &CB, const FunctionSamples *CalleeSamples) { if (ExternalInlineAdvisor) { auto Advice = ExternalInlineAdvisor->getAdvice(CB); if (!Advice->isInliningRecommended()) { @@ -1012,6 +1137,9 @@ // The call to InlineFunction erases I, so we can't pass it here. emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost, true, CSINLINE_DEBUG); + if (ProfileIsCS) + ContextTracker->markContextSamplesInlined(CalleeSamples); + ++NumCSInlined; return true; } return false; @@ -1129,34 +1257,17 @@ if (!callsiteIsHot(FS, PSI)) continue; - const char *Reason = "Callee function not available"; - // R->getValue() != &F is to prevent promoting a recursive call. - // If it is a recursive call, we do not inline it as it could bloat - // the code exponentially. There is way to better handle this, e.g. - // clone the caller first, and inline the cloned caller if it is - // recursive. As llvm does not inline recursive calls, we will - // simply ignore it instead of handling it explicitly. 
+ const char *Reason = nullptr; auto CalleeFunctionName = FS->getFuncName(); - auto R = SymbolMap.find(CalleeFunctionName); - if (R != SymbolMap.end() && R->getValue() && - !R->getValue()->isDeclaration() && - R->getValue()->getSubprogram() && - R->getValue()->hasFnAttribute("use-sample-profile") && - R->getValue() != &F && - isLegalToPromote(*I, R->getValue(), &Reason)) { - uint64_t C = FS->getEntrySamples(); - auto &DI = - pgo::promoteIndirectCall(*I, R->getValue(), C, Sum, false, ORE); - Sum -= C; + if (CallBase *DI = + tryPromoteIndirectCall(F, CalleeFunctionName, Sum, + FS->getEntrySamples(), I, Reason)) { PromotedInsns.insert(I); // If profile mismatches, we should not attempt to inline DI. if ((isa(DI) || isa(DI)) && - inlineCallInstruction(cast(DI))) { - if (ProfileIsCS) - ContextTracker->markContextSamplesInlined(FS); + inlineCallInstruction(cast(*DI), FS)) { localNotInlinedCallSites.erase(I); LocalChanged = true; - ++NumCSInlined; } } else { LLVM_DEBUG(dbgs() @@ -1166,13 +1277,11 @@ } } else if (CalledFunction && CalledFunction->getSubprogram() && !CalledFunction->isDeclaration()) { - if (inlineCallInstruction(*I)) { - if (ProfileIsCS) - ContextTracker->markContextSamplesInlined( - localNotInlinedCallSites[I]); + if (inlineCallInstruction(*I, localNotInlinedCallSites.count(I) + ? 
localNotInlinedCallSites[I] + : nullptr)) { localNotInlinedCallSites.erase(I); LocalChanged = true; - ++NumCSInlined; } } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { findCalleeFunctionSamples(*I)->findInlinedFunctions( @@ -1186,6 +1295,11 @@ } } + // For CS profile, profile for not inlined context will be merged when + // base profile is being retrieved + if (ProfileIsCS) + return Changed; + // Accumulate not inlined callsite information into notInlinedSamples for (const auto &Pair : localNotInlinedCallSites) { CallBase *I = Pair.getFirst(); @@ -1232,6 +1346,254 @@ return Changed; } +bool SampleProfileLoader::tryInlineCandidate( + InlineCandidate &Candidate, SmallVector &InlinedCallSites) { + + CallBase &CB = *Candidate.CallInstr; + Function *CalledFunction = CB.getCalledFunction(); + assert(CalledFunction && "Expect a callee with definition"); + DebugLoc DLoc = CB.getDebugLoc(); + BasicBlock *BB = CB.getParent(); + + InlineCost Cost = shouldInlineCandidate(Candidate); + if (Cost.isNever()) { + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB) + << "incompatible inlining"); + return false; + } + + if (!Cost) + return false; + + InlineFunctionInfo IFI(nullptr, GetAC); + if (InlineFunction(CB, IFI).isSuccess()) { + // The call to InlineFunction erases CB, so we can't pass it here. + emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost, + true, CSINLINE_DEBUG); + + // Now populate the list of newly exposed call sites. + InlinedCallSites.clear(); + for (auto &I : IFI.InlinedCallSites) + InlinedCallSites.push_back(I); + + if (ProfileIsCS) + ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples); + ++NumCSInlined; + return true; + } + return false; +} + +bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate, + CallBase *CB) { + assert(CB && "Expect non-null call instruction"); + + if (isa(CB)) + return false; + + // Find the callee's profile. 
For indirect call, find hottest target profile. + const FunctionSamples *CalleeSamples = findCalleeFunctionSamples(*CB); + if (!CalleeSamples) + return false; + + uint64_t CallsiteCount = 0; + ErrorOr Weight = getBlockWeight(CB->getParent()); + if (Weight) + CallsiteCount = Weight.get(); + if (CalleeSamples) + CallsiteCount = std::max(CallsiteCount, CalleeSamples->getEntrySamples()); + + *NewCandidate = {CB, CalleeSamples, CallsiteCount}; + return true; +} + +InlineCost +SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { + assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now"); + + std::unique_ptr Advice = nullptr; + if (ExternalInlineAdvisor) { + Advice = ExternalInlineAdvisor->getAdvice(*Candidate.CallInstr); + if (!Advice->isInliningRecommended()) { + Advice->recordUnattemptedInlining(); + return InlineCost::getNever("not previously inlined"); + } + Advice->recordInlining(); + return InlineCost::getAlways("previously inlined"); + } + + // Adjust threshold based on call site hotness, only do this for callsite + // prioritized inliner because otherwise cost-benefit check is done earlier. + int SampleThreshold = SampleColdCallSiteThreshold; + if (CallsitePrioritizedInline) { + if (Candidate.CallsiteCount > PSI->getHotCountThreshold()) + SampleThreshold = SampleHotCallSiteThreshold; + else if (!ProfileSizeInline) + return InlineCost::getNever("cold callsite"); + } + + Function *Callee = Candidate.CallInstr->getCalledFunction(); + assert(Callee && "Expect a definition for inline candidate of direct call"); + + InlineParams Params = getInlineParams(); + Params.ComputeFullInlineCost = true; + // Checks if there is anything in the reachable portion of the callee at + // this callsite that makes this inlining potentially illegal. Need to + // set ComputeFullInlineCost, otherwise getInlineCost may return early + // when cost exceeds threshold without checking all IRs in the callee. 
+ // The actual cost does not matter because we only check isNever() to + // see if it is legal to inline the callsite. + InlineCost Cost = getInlineCost(*Candidate.CallInstr, Callee, Params, + GetTTI(*Callee), GetAC, GetTLI); + + // For old FDO inliner, we inline the call site as long as cost is not + // "Never". The cost-benefit check is done earlier. + if (!CallsitePrioritizedInline) { + if (Cost.isNever()) + return Cost; + return InlineCost::getAlways("hot callsite previously inlined"); + } + + // Honor always inline and never inline from call analyzer + if (Cost.isNever() || Cost.isAlways()) + return Cost; + + // Otherwise only use the cost from call analyzer, but overwrite threshold + // with Sample PGO threshold. + return InlineCost::get(Cost.getCost(), SampleThreshold); +} + +bool SampleProfileLoader::inlineHotFunctionsWithPriority( + Function &F, DenseSet &InlinedGUIDs) { + DenseSet PromotedInsns; + assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now"); + + // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure + // Profile symbol list is ignored when profile-sample-accurate is on. + assert((!ProfAccForSymsInList || + (!ProfileSampleAccurate && + !F.hasFnAttribute("profile-sample-accurate"))) && + "ProfAccForSymsInList should be false when profile-sample-accurate " + "is enabled"); + + // Populating worklist with initial call sites from root inliner, along + // with call site weights. + CandidateQueue CQueue; + InlineCandidate NewCandidate; + for (auto &BB : F) { + for (auto &I : BB.getInstList()) { + auto *CB = dyn_cast(&I); + if (!CB) + continue; + if (getInlineCandidate(&NewCandidate, CB)) + CQueue.push(NewCandidate); + } + } + + // Cap the size growth from profile guided inlining. 
This is needed even + // though cost of each inline candidate already accounts for callee size, + // because with top-down inlining, we can grow inliner size significantly + // with large number of smaller inlinees each pass the cost check. + assert(ProfileInlineLimitMax >= ProfileInlineLimitMin && + "Max inline size limit should not be smaller than min inline size " + "limit."); + unsigned SizeLimit = F.getInstructionCount() * ProfileInlineGrowthLimit; + SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); + SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin); + if (ExternalInlineAdvisor) + SizeLimit = std::numeric_limits::max(); + + // Perform iterative BFS call site prioritized inlining + bool Changed = false; + while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) { + InlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + CallBase *I = Candidate.CallInstr; + Function *CalledFunction = I->getCalledFunction(); + + if (CalledFunction == &F) + continue; + if (I->isIndirectCall()) { + if (PromotedInsns.count(I)) + continue; + uint64_t Sum; + auto CalleeSamples = findIndirectCallFunctionSamples(*I, Sum); + uint64_t SumOrigin = Sum; + for (const auto *FS : CalleeSamples) { + // TODO: Consider disable pre-lTO ICP for MonoLTO as well + if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { + FS->findInlinedFunctions(InlinedGUIDs, F.getParent(), + PSI->getOrCompHotCountThreshold()); + continue; + } + uint64_t EntryCountDistributed = FS->getEntrySamples(); + // In addition to regular inline cost check, we also need to make sure + // ICP isn't introducing excessive speculative checks even if individual + // target looks beneficial to promote and inline. That means we should + // only do ICP when there's a small number dominant targets. + if (EntryCountDistributed < SumOrigin / ProfileICPThreshold) + break; + // TODO: Fix CallAnalyzer to handle all indirect calls. 
+ // For indirect call, we don't run CallAnalyzer to get InlineCost + // before actual inlining. This is because we could see two different + // types from the same definition, which makes CallAnalyzer choke as + // it's expecting matching parameter type on both caller and callee + // side. See example from PR18962 for the triggering cases (the bug was + // fixed, but we generate different types). + if (!PSI->isHotCount(EntryCountDistributed)) + break; + const char *Reason = nullptr; + auto CalleeFunctionName = FS->getFuncName(); + if (CallBase *DI = tryPromoteIndirectCall( + F, CalleeFunctionName, Sum, EntryCountDistributed, I, Reason)) { + // Attach function profile for promoted indirect callee, and update + // call site count for the promoted inline candidate too. + Candidate = {DI, FS, EntryCountDistributed}; + PromotedInsns.insert(I); + SmallVector InlinedCallSites; + // If profile mismatches, we should not attempt to inline DI. + if ((isa(DI) || isa(DI)) && + tryInlineCandidate(Candidate, InlinedCallSites)) { + for (auto *CB : InlinedCallSites) { + if (getInlineCandidate(&NewCandidate, CB)) + CQueue.emplace(NewCandidate); + } + Changed = true; + } + } else { + LLVM_DEBUG(dbgs() + << "\nFailed to promote indirect call to " + << CalleeFunctionName << " because " << Reason << "\n"); + } + } + } else if (CalledFunction && CalledFunction->getSubprogram() && + !CalledFunction->isDeclaration()) { + SmallVector InlinedCallSites; + if (tryInlineCandidate(Candidate, InlinedCallSites)) { + for (auto *CB : InlinedCallSites) { + if (getInlineCandidate(&NewCandidate, CB)) + CQueue.emplace(NewCandidate); + } + Changed = true; + } + } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { + findCalleeFunctionSamples(*I)->findInlinedFunctions( + InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold()); + } + } + + if (!CQueue.empty()) { + if (SizeLimit == (unsigned)ProfileInlineLimitMax) + ++NumCSInlinedHitMaxLimit; + else if (SizeLimit == 
(unsigned)ProfileInlineLimitMin) + ++NumCSInlinedHitMinLimit; + else + ++NumCSInlinedHitGrowthLimit; + } + + return Changed; +} + /// Find equivalence classes for the given block. /// /// This finds all the blocks that are guaranteed to execute the same @@ -1833,7 +2195,10 @@ } DenseSet InlinedGUIDs; - Changed |= inlineHotFunctions(F, InlinedGUIDs); + if (ProfileIsCS && CallsitePrioritizedInline) + Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs); + else + Changed |= inlineHotFunctions(F, InlinedGUIDs); // Compute basic block weights. Changed |= computeBlockWeights(F); @@ -1978,6 +2343,12 @@ ProfileIsCS = true; FunctionSamples::ProfileIsCS = true; + // Enable priority-base inliner and size inline by default for CSSPGO. + if (!ProfileSizeInline.getNumOccurrences()) + ProfileSizeInline = true; + if (!CallsitePrioritizedInline.getNumOccurrences()) + CallsitePrioritizedInline = true; + // Tracker for profiles under different context ContextTracker = std::make_unique(Reader->getProfiles()); diff --git a/llvm/test/Transforms/SampleProfile/Inputs/indirect-call-csspgo.prof b/llvm/test/Transforms/SampleProfile/Inputs/indirect-call-csspgo.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/indirect-call-csspgo.prof @@ -0,0 +1,10 @@ +[test]:63067:0 + 1: 3345 _Z3barv:1398 _Z3foov:2059 + 2: 100 _Z3bazv:102 + 3: 100 _Z3zoov:102 +[test:1 @ _Z3barv]:200:100 + 1: 100 +[test:1 @ _Z3foov]:4220:1200 + 14: 4220 +[test:2 @ _Z3bazv]:200:100 + 5: 100 \ No newline at end of file diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll copy from llvm/test/Transforms/SampleProfile/profile-context-tracker.ll copy to llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll --- a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll +++ b/llvm/test/Transforms/SampleProfile/csspgo-inline-debug.ll @@ -1,29 +1,28 @@ -; Test for CSSPGO's SampleContextTracker to make 
sure context profile tree is promoted and merged properly -; based on inline decision, so post inline counts are accurate. - -; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/profile-context-tracker.prof -o %t +; REQUIRES: asserts +; Test that the new FDO inliner using prioty queue will not visit same call site again and again. +; Use debug prints as repeated call site evaluation is not visible from final inline decision. ; Note that we need new pass manager to enable top-down processing for sample profile loader -; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile -; main:3 @ _Z5funcAi -; main:3 @ _Z5funcAi:1 @ _Z8funcLeafi -; _Z5funcBi:1 @ _Z8funcLeafi -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL - -; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile -; main:3 @ _Z5funcAi -; _Z5funcAi:1 @ _Z8funcLeafi -; _Z5funcBi:1 @ _Z8funcLeafi -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT - +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=OLD-INLINE +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size 
-sample-profile-prioritized-inline=1 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=NEW-INLINE + +; Old inliner will evaluate the same call site three times +; OLD-INLINE: Getting callee context for instr: %call = tail call i32 @_Z5funcBi +; OLD-INLINE-NEXT: Callee context found: main:3.1 @ _Z5funcBi +; OLD-INLINE: Getting callee context for instr: %call = tail call i32 @_Z5funcBi +; OLD-INLINE-NEXT: Callee context found: main:3.1 @ _Z5funcBi +; OLD-INLINE: Getting callee context for instr: %call = tail call i32 @_Z5funcBi +; OLD-INLINE-NEXT: Callee context found: main:3.1 @ _Z5funcBi + +; New inliner only evaluate the same call site once +; NEW-INLINE: Getting callee context for instr: %call = tail call i32 @_Z5funcBi +; NEW-INLINE-NEXT: Callee context found: main:3.1 @ _Z5funcBi +; NEW-INLINE-NOT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi +; NEW-INLINE-NOT: Callee context found: main:3.1 @ _Z5funcBi @factor = dso_local global i32 3, align 4, !dbg !0 define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 { -; INLINE-ALL: @main{{.*}}!prof ![[MAIN_PROF:[0-9]+]] -; INLINE-HOT: @main{{.*}}!prof ![[MAIN_PROF:[0-9]+]] entry: br label %for.body, !dbg !25 @@ -34,13 +33,8 @@ %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ] %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ] %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32 -; _Z5funcBi is marked noinline -; INLINE-ALL: call i32 @_Z5funcBi -; INLINE-HOT: call i32 @_Z5funcBi %add = add nuw nsw i32 %x.011, 1, !dbg !31 %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28 -; INLINE-ALL-NOT: call i32 @_Z5funcAi -; INLINE-HOT: call i32 @_Z5funcAi %add2 = add i32 %call, %r.010, !dbg !34 %add3 = add i32 %add2, %call1, !dbg !35 %dec = add nsw i32 %x.011, -1, !dbg !36 @@ -49,25 +43,13 @@ } define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #1 !dbg !40 { -; _Z5funcAi is inlined, so outline remainder should have zero counts -; INLINE-ALL: 
@_Z5funcAi{{.*}}!prof ![[FUNCA_PROF:[0-9]+]] -; INLINE-HOT: @_Z5funcAi{{.*}}!prof ![[FUNCA_PROF:[0-9]+]] entry: %add = add nsw i32 %x, 100000, !dbg !44 -; _Z8funcLeafi is already inlined on main->_Z5funcAi->_Z8funcLeafi, -; so it should not be inlined on _Z5funcAi->_Z8funcLeafi based on updated -; (merged and promoted) context profile -; INLINE-ALL: call i32 @_Z8funcLeafi -; INLINE-HOT-NOT: call i32 @_Z8funcLeafi %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !45 ret i32 %call, !dbg !46 } define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 { -; main->_Z5funcAi->_Z8funcLeafi is inlined, and _Z5funcBi->_Z8funcLeafi is also -; inlined, so outline remainder should have empty profile -; INLINE-ALL: @_Z8funcLeafi{{.*}}!prof ![[LEAF_PROF:[0-9]+]] -; INLINE-HOT: @_Z8funcLeafi{{.*}}!prof ![[LEAF_PROF:[0-9]+]] entry: %cmp = icmp sgt i32 %x, 0, !dbg !57 br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59 @@ -98,29 +80,12 @@ } define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 { -; _Z5funcBi is marked noinline, so outline remainder has promoted context profile -; INLINE-ALL: @_Z5funcBi{{.*}}!prof ![[FUNCB_PROF:[0-9]+]] -; INLINE-HOT: @_Z5funcBi{{.*}}!prof ![[FUNCB_PROF:[0-9]+]] entry: %sub = add nsw i32 %x, -100000, !dbg !51 %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52 -; _Z5funcBi is not inlined into main, so we main->_Z5funcBi->_Z8funcLeafi -; should be inlined based on promoted context profile -; INLINE-ALL-NOT: call i32 @_Z8funcLeafi -; INLINE-HOT-NOT: call i32 @_Z8funcLeafi ret i32 %call, !dbg !53 } -; INLINE-ALL-DAG: [[MAIN_PROF]] = !{!"function_entry_count", i64 13} -; INLINE-ALL-DAG: [[FUNCA_PROF]] = !{!"function_entry_count", i64 0} -; INLINE-ALL-DAG-SAME: [[LEAF_PROF]] = !{!"function_entry_count", i64 0} -; INLINE-ALL-DAG: [[FUNCB_PROF]] = !{!"function_entry_count", i64 33} - -; INLINE-HOT-DAG: [[MAIN_PROF]] = !{!"function_entry_count", i64 13} -; INLINE-HOT-DAG: [[FUNCA_PROF]] = 
!{!"function_entry_count", i64 12} -; INLINE-HOT-DAG-SAME: [[LEAF_PROF]] = !{!"function_entry_count", i64 0} -; INLINE-HOT-DAG: [[FUNCB_PROF]] = !{!"function_entry_count", i64 33} - declare i32 @_Z3fibi(i32) attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" } diff --git a/llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/csspgo-inline-icall.ll @@ -0,0 +1,63 @@ +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-ALL %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-ALL %s +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -sample-profile-inline-size=0 -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-HOT %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/indirect-call-csspgo.prof -sample-profile-icp-threshold=100 -pass-remarks=sample-profile -sample-profile-inline-size=0 -S -o /dev/null 2>&1 | FileCheck -check-prefix=ICP-HOT %s + +define void @test(void ()*) #0 !dbg !3 { +;; Add two direct call to force top-down 
order for sample profile loader + call void @_Z3foov(), !dbg !7 + call void @_Z3barv(), !dbg !7 + call void @_Z3bazv(), !dbg !7 + %2 = alloca void ()* + store void ()* %0, void ()** %2 + %3 = load void ()*, void ()** %2 + call void %3(), !dbg !4 + %4 = alloca void ()* + store void ()* %0, void ()** %4 + %5 = load void ()*, void ()** %4 + call void %5(), !dbg !5 + ret void +} + +define void @_Z3foov() #0 !dbg !8 { + ret void +} + +define void @_Z3barv() #0 !dbg !9 { + ret void +} + +define void @_Z3bazv() #0 !dbg !10 { + ret void +} + +define void @_Z3zoov() #0 !dbg !11 { + ret void +} + +attributes #0 = {"use-sample-profile"} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1) +!1 = !DIFile(filename: "test.cc", directory: "/") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 3, unit: !0) +!4 = !DILocation(line: 4, scope: !3) +!5 = !DILocation(line: 5, scope: !3) +!6 = !DILocation(line: 6, scope: !3) +!7 = !DILocation(line: 7, scope: !3) +!8 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 29, unit: !0) +!9 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 32, unit: !0) +!10 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 24, unit: !0) +!11 = distinct !DISubprogram(name: "zoo", linkageName: "_Z3zoov", scope: !1, file: !1, line: 24, unit: !0) + + +; ICP-ALL: remark: test.cc:5:0: _Z3bazv inlined into test +; ICP-ALL-NEXT: remark: test.cc:4:0: _Z3foov inlined into test +; ICP-ALL-NEXT: remark: test.cc:4:0: _Z3barv inlined into test +; ICP-ALL-NOT: remark + +; ICP-HOT: remark: test.cc:4:0: _Z3foov inlined into test +; ICP-HOT-NOT: remark diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll copy from 
llvm/test/Transforms/SampleProfile/profile-context-tracker.ll copy to llvm/test/Transforms/SampleProfile/csspgo-inline.ll --- a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll +++ b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll @@ -1,29 +1,42 @@ -; Test for CSSPGO's SampleContextTracker to make sure context profile tree is promoted and merged properly -; based on inline decision, so post inline counts are accurate. - -; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/profile-context-tracker.prof -o %t +; Test for CSSPGO's new early inliner using priority queue ; Note that we need new pass manager to enable top-down processing for sample profile loader -; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile +; Test we inlined the following in top-down order with old inliner ; main:3 @ _Z5funcAi ; main:3 @ _Z5funcAi:1 @ _Z8funcLeafi ; _Z5funcBi:1 @ _Z8funcLeafi -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL - -; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile -; main:3 @ _Z5funcAi -; _Z5funcAi:1 @ _Z8funcLeafi -; _Z5funcBi:1 @ _Z8funcLeafi -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT - +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 
-profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE +; +; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, so we get less inlining for given profile +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-NEW +; +; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, tuning hot cutoff can get us the same inlining +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE +; +; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, tuning cold sample profile inline threshold can get us the same inlining +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE +; +; With new FDO early inliner and tuned cutoff, we can control inlining through size growth tuning knob. 
+; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -sample-profile-inline-limit-min=0 -sample-profile-inline-growth-limit=1 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --allow-empty --check-prefix=INLINE-NEW-LIMIT1 +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -sample-profile-inline-limit-min=10 -sample-profile-inline-growth-limit=1 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-NEW-LIMIT2 + + +; INLINE-BASE: remark: merged.cpp:14:10: _Z5funcAi inlined into main to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite main:3:10 +; INLINE-BASE: remark: merged.cpp:27:11: _Z8funcLeafi inlined into main to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcAi:1:11 @ main:3:10 +; INLINE-BASE: remark: merged.cpp:33:11: _Z8funcLeafi inlined into _Z5funcBi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcBi:1:11 + +; INLINE-NEW: remark: merged.cpp:14:10: _Z5funcAi inlined into main to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite main:3:10 +; INLINE-NEW-NOT: remark + +; INLINE-NEW-LIMIT1-NOT: remark + +; INLINE-NEW-LIMIT2: remark: merged.cpp:27:11: _Z8funcLeafi inlined into _Z5funcAi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcAi:1:11 +; INLINE-NEW-LIMIT2: remark: merged.cpp:33:11: _Z8funcLeafi inlined into _Z5funcBi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcBi:1:11 +; INLINE-NEW-LIMIT2-NOT: remark @factor = dso_local global i32 3, align 4, !dbg !0 define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 { -; 
INLINE-ALL: @main{{.*}}!prof ![[MAIN_PROF:[0-9]+]] -; INLINE-HOT: @main{{.*}}!prof ![[MAIN_PROF:[0-9]+]] entry: br label %for.body, !dbg !25 @@ -34,13 +47,8 @@ %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ] %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ] %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32 -; _Z5funcBi is marked noinline -; INLINE-ALL: call i32 @_Z5funcBi -; INLINE-HOT: call i32 @_Z5funcBi %add = add nuw nsw i32 %x.011, 1, !dbg !31 %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28 -; INLINE-ALL-NOT: call i32 @_Z5funcAi -; INLINE-HOT: call i32 @_Z5funcAi %add2 = add i32 %call, %r.010, !dbg !34 %add3 = add i32 %add2, %call1, !dbg !35 %dec = add nsw i32 %x.011, -1, !dbg !36 @@ -49,25 +57,13 @@ } define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #1 !dbg !40 { -; _Z5funcAi is inlined, so outline remainder should have zero counts -; INLINE-ALL: @_Z5funcAi{{.*}}!prof ![[FUNCA_PROF:[0-9]+]] -; INLINE-HOT: @_Z5funcAi{{.*}}!prof ![[FUNCA_PROF:[0-9]+]] entry: %add = add nsw i32 %x, 100000, !dbg !44 -; _Z8funcLeafi is already inlined on main->_Z5funcAi->_Z8funcLeafi, -; so it should not be inlined on _Z5funcAi->_Z8funcLeafi based on updated -; (merged and promoted) context profile -; INLINE-ALL: call i32 @_Z8funcLeafi -; INLINE-HOT-NOT: call i32 @_Z8funcLeafi %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !45 ret i32 %call, !dbg !46 } define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 { -; main->_Z5funcAi->_Z8funcLeafi is inlined, and _Z5funcBi->_Z8funcLeafi is also -; inlined, so outline remainder should have empty profile -; INLINE-ALL: @_Z8funcLeafi{{.*}}!prof ![[LEAF_PROF:[0-9]+]] -; INLINE-HOT: @_Z8funcLeafi{{.*}}!prof ![[LEAF_PROF:[0-9]+]] entry: %cmp = icmp sgt i32 %x, 0, !dbg !57 br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59 @@ -98,29 +94,12 @@ } define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 { -; _Z5funcBi is marked noinline, so outline remainder 
has promoted context profile -; INLINE-ALL: @_Z5funcBi{{.*}}!prof ![[FUNCB_PROF:[0-9]+]] -; INLINE-HOT: @_Z5funcBi{{.*}}!prof ![[FUNCB_PROF:[0-9]+]] entry: %sub = add nsw i32 %x, -100000, !dbg !51 %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52 -; _Z5funcBi is not inlined into main, so we main->_Z5funcBi->_Z8funcLeafi -; should be inlined based on promoted context profile -; INLINE-ALL-NOT: call i32 @_Z8funcLeafi -; INLINE-HOT-NOT: call i32 @_Z8funcLeafi ret i32 %call, !dbg !53 } -; INLINE-ALL-DAG: [[MAIN_PROF]] = !{!"function_entry_count", i64 13} -; INLINE-ALL-DAG: [[FUNCA_PROF]] = !{!"function_entry_count", i64 0} -; INLINE-ALL-DAG-SAME: [[LEAF_PROF]] = !{!"function_entry_count", i64 0} -; INLINE-ALL-DAG: [[FUNCB_PROF]] = !{!"function_entry_count", i64 33} - -; INLINE-HOT-DAG: [[MAIN_PROF]] = !{!"function_entry_count", i64 13} -; INLINE-HOT-DAG: [[FUNCA_PROF]] = !{!"function_entry_count", i64 12} -; INLINE-HOT-DAG-SAME: [[LEAF_PROF]] = !{!"function_entry_count", i64 0} -; INLINE-HOT-DAG: [[FUNCB_PROF]] = !{!"function_entry_count", i64 33} - declare i32 @_Z3fibi(i32) attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" } diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll --- a/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll +++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker-debug.ll @@ -3,11 +3,11 @@ ; based on inline decision, so post inline counts 
are accurate. ; Note that we need new pass manager to enable top-down processing for sample profile loader -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-ALL -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-HOT +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-ALL +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-prioritized-inline=0 -sample-profile-inline-size=0 -debug-only=sample-context-tracker -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-HOT -; Testwe we inlined the following in top-down order and promot rest not inlined context profile into base profile +; Test we inlined the following in top-down order and promoted the rest of the not-inlined context profile into the base profile ; main:3 @ _Z5funcAi ; main:3 @ _Z5funcAi:1 @ _Z8funcLeafi ; _Z5funcBi:1 @ _Z8funcLeafi @@ -20,13 +20,9 @@ ; INLINE-ALL-NEXT: Getting callee context for instr: %call1 = tail call i32 @_Z5funcAi ; INLINE-ALL-NEXT: Callee context found: main:3 @ _Z5funcAi ; INLINE-ALL-NEXT: Marking context profile as inlined: main:3 @ _Z5funcAi -; INLINE-ALL-NEXT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi( -; INLINE-ALL-NEXT: Callee context found: main:3.1 @ _Z5funcBi ; INLINE-ALL-NEXT: Getting callee context for instr: %call.i = tail call i32 @_Z8funcLeafi ; INLINE-ALL-NEXT: Callee context found: main:3 @ _Z5funcAi:1 @ _Z8funcLeafi ; INLINE-ALL-NEXT: Marking context profile as inlined: main:3 @ _Z5funcAi:1 
@ _Z8funcLeafi -; INLINE-ALL-NEXT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi -; INLINE-ALL-NEXT: Callee context found: main:3.1 @ _Z5funcBi ; INLINE-ALL-NEXT: Getting callee context for instr: %call.i1 = tail call i32 @_Z3fibi ; INLINE-ALL-NEXT: Getting callee context for instr: %call5.i = tail call i32 @_Z3fibi ; INLINE-ALL-NEXT: Getting base profile for function: _Z5funcAi @@ -48,24 +44,23 @@ ; INLINE-ALL-NEXT: Getting base profile for function: _Z8funcLeafi ; INLINE-ALL-NEXT: Merging context profile into base profile: _Z8funcLeafi -; Testwe we inlined the following in top-down order and promot rest not inlined context profile into base profile -; main:3 @ _Z5funcAi +; Test we inlined the following in top-down order and promoted the rest of the not-inlined context profile into the base profile ; _Z5funcAi:1 @ _Z8funcLeafi ; _Z5funcBi:1 @ _Z8funcLeafi ; INLINE-HOT: Getting base profile for function: main ; INLINE-HOT-NEXT: Merging context profile into base profile: main ; INLINE-HOT-NEXT: Found context tree root to promote: external:12 @ main ; INLINE-HOT-NEXT: Context promoted and merged to: main -; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !58 +; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z5funcBi ; INLINE-HOT-NEXT: Callee context found: main:3.1 @ _Z5funcBi -; INLINE-HOT-NEXT: Getting callee context for instr: %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !63 +; INLINE-HOT-NEXT: Getting callee context for instr: %call1 = tail call i32 @_Z5funcAi ; INLINE-HOT-NEXT: Callee context found: main:3 @ _Z5funcAi ; INLINE-HOT-NEXT: Getting base profile for function: _Z5funcAi ; INLINE-HOT-NEXT: Merging context profile into base profile: _Z5funcAi ; INLINE-HOT-NEXT: Found context tree root to promote: main:3 @ _Z5funcAi ; INLINE-HOT-NEXT: Context promoted to: _Z5funcAi ; INLINE-HOT-NEXT: Context promoted to: _Z5funcAi:1 @ _Z8funcLeafi -; INLINE-HOT-NEXT: Getting callee 
context for instr: %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !50 +; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !50 ; INLINE-HOT-NEXT: Callee context found: _Z5funcAi:1 @ _Z8funcLeafi ; INLINE-HOT-NEXT: Marking context profile as inlined: _Z5funcAi:1 @ _Z8funcLeafi ; INLINE-HOT-NEXT: Getting callee context for instr: %call.i = tail call i32 @_Z3fibi(i32 %tmp.i) #2, !dbg !62 @@ -79,11 +74,11 @@ ; INLINE-HOT-NEXT: Context promoted to: _Z5funcBi:1 @ _Z8funcLeafi ; INLINE-HOT-NEXT: Found context tree root to promote: externalA:17 @ _Z5funcBi ; INLINE-HOT-NEXT: Context promoted and merged to: _Z5funcBi -; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !50 +; INLINE-HOT-NEXT: Getting callee context for instr: %call = tail call i32 @_Z8funcLeafi ; INLINE-HOT-NEXT: Callee context found: _Z5funcBi:1 @ _Z8funcLeafi ; INLINE-HOT-NEXT: Marking context profile as inlined: _Z5funcBi:1 @ _Z8funcLeafi -; INLINE-HOT-NEXT: Getting callee context for instr: %call.i = tail call i32 @_Z3fibi(i32 %tmp.i) #2, !dbg !62 -; INLINE-HOT-NEXT: Getting callee context for instr: %call5.i = tail call i32 @_Z3fibi(i32 %tmp1.i) #2, !dbg !69 +; INLINE-HOT-NEXT: Getting callee context for instr: %call.i = tail call i32 @_Z3fibi +; INLINE-HOT-NEXT: Getting callee context for instr: %call5.i = tail call i32 @_Z3fibi ; INLINE-HOT-NEXT: Getting base profile for function: _Z8funcLeafi ; INLINE-HOT-NEXT: Merging context profile into base profile: _Z8funcLeafi diff --git a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll --- a/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll +++ b/llvm/test/Transforms/SampleProfile/profile-context-tracker.ll @@ -4,19 +4,18 @@ ; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/profile-context-tracker.prof -o %t ; Note that we need new pass manager to enable 
top-down processing for sample profile loader -; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile +; Test we inlined the following in top-down order and entry counts accurate reflects post-inline base profile ; main:3 @ _Z5funcAi ; main:3 @ _Z5funcAi:1 @ _Z8funcLeafi ; _Z5funcBi:1 @ _Z8funcLeafi -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL - -; Testwe we inlined the following in top-down order and entry counts accurate reflects post-inline base profile -; main:3 @ _Z5funcAi +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-ALL +; +; Test we inlined the following in top-down order and entry counts accurate reflects post-inline base profile ; _Z5funcAi:1 @ _Z8funcLeafi ; _Z5funcBi:1 @ _Z8funcLeafi -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof 
-profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%t -profile-sample-accurate -S | FileCheck %s --check-prefix=INLINE-HOT @factor = dso_local global i32 3, align 4, !dbg !0 diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-inline.ll @@ -1,8 +1,8 @@ -; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-inline.prof -S -pass-remarks=sample-profile -pass-remarks-output=%t.opt.yaml 2>&1 | FileCheck %s +; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%S/Inputs/pseudo-probe-inline.prof -S -pass-remarks=sample-profile -sample-profile-prioritized-inline=0 -pass-remarks-output=%t.opt.yaml 2>&1 | FileCheck %s ; RUN: FileCheck %s -check-prefix=YAML < %t.opt.yaml ; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/pseudo-probe-inline.prof -o %t2 -; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%t2 -S -pass-remarks=sample-profile -pass-remarks-output=%t2.opt.yaml 2>&1 | FileCheck %s +; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-file=%t2 -S -pass-remarks=sample-profile -sample-profile-prioritized-inline=0 -pass-remarks-output=%t2.opt.yaml 2>&1 | FileCheck %s ; RUN: FileCheck %s -check-prefix=YAML < %t2.opt.yaml target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"