diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp
--- a/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -807,91 +807,20 @@
   DenseMap<CallBase *, int> InlineHistoryMap;
 };
 
-PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
-                                   CGSCCAnalysisManager &AM, LazyCallGraph &CG,
-                                   CGSCCUpdateResult &UR) {
-  const auto &MAMProxy =
-      AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG);
+static bool
+doInline(std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>> &Calls,
+         InlineAdvisor &Advisor, ProfileSummaryInfo *PSI,
+         FunctionAnalysisManager &FAM, bool OnlyMandatory, bool DoSCCInline,
+         LazyCallGraph::SCC *C, LazyCallGraph &CG,
+         SmallVector<Function *, 4> &DeadFunctions,
+         function_ref<bool(Function &, LazyCallGraph::SCC *,
+                           LazyCallGraph::Node &)>
+             CheckSCCSplit,
+         function_ref<void(LazyCallGraph::SCC *, LazyCallGraph::Node &,
+                           SmallSetVector<Function *, 4> &InlinedCallees)>
+             UpdateSCC) {
   bool Changed = false;
 
-  assert(InitialC.size() > 0 && "Cannot handle an empty SCC!");
-  Module &M = *InitialC.begin()->getFunction().getParent();
-  ProfileSummaryInfo *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(M);
-
-  FunctionAnalysisManager &FAM =
-      AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG)
-          .getManager();
-
-  InlineAdvisor &Advisor = getAdvisor(MAMProxy, FAM, M);
-  Advisor.onPassEntry();
-
-  auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); });
-
-  // We use a single common worklist for calls across the entire SCC. We
-  // process these in-order and append new calls introduced during inlining to
-  // the end. The PriorityInlineOrder is optional here, in which the smaller
-  // callee would have a higher priority to inline.
-  //
-  // Note that this particular order of processing is actually critical to
-  // avoid very bad behaviors. Consider *highly connected* call graphs where
-  // each function contains a small amount of code and a couple of calls to
-  // other functions. Because the LLVM inliner is fundamentally a bottom-up
-  // inliner, it can handle gracefully the fact that these all appear to be
-  // reasonable inlining candidates as it will flatten things until they become
-  // too big to inline, and then move on and flatten another batch.
-  //
-  // However, when processing call edges *within* an SCC we cannot rely on this
-  // bottom-up behavior. As a consequence, with heavily connected *SCCs* of
-  // functions we can end up incrementally inlining N calls into each of
-  // N functions because each incremental inlining decision looks good and we
-  // don't have a topological ordering to prevent explosions.
-  //
-  // To compensate for this, we don't process transitive edges made immediate
-  // by inlining until we've done one pass of inlining across the entire SCC.
-  // Large, highly connected SCCs still lead to some amount of code bloat in
-  // this model, but it is uniformly spread across all the functions in the SCC
-  // and eventually they all become too large to inline, rather than
-  // incrementally maknig a single function grow in a super linear fashion.
-  std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>> Calls;
-  if (InlineEnablePriorityOrder)
-    Calls = std::make_unique<PriorityInlineOrder<InlineSizePriority>>();
-  else
-    Calls = std::make_unique<DefaultInlineOrder<std::pair<CallBase *, int>>>();
-  assert(Calls != nullptr && "Expected an initialized InlineOrder");
-
-  // Populate the initial list of calls in this SCC.
-  for (auto &N : InitialC) {
-    auto &ORE =
-        FAM.getResult<OptimizationRemarkEmitterAnalysis>(N.getFunction());
-    // We want to generally process call sites top-down in order for
-    // simplifications stemming from replacing the call with the returned value
-    // after inlining to be visible to subsequent inlining decisions.
-    // FIXME: Using instructions sequence is a really bad way to do this.
-    // Instead we should do an actual RPO walk of the function body.
-    for (Instruction &I : instructions(N.getFunction()))
-      if (auto *CB = dyn_cast<CallBase>(&I))
-        if (Function *Callee = CB->getCalledFunction()) {
-          if (!Callee->isDeclaration())
-            Calls->push({CB, -1});
-          else if (!isa<IntrinsicInst>(I)) {
-            using namespace ore;
-            setInlineRemark(*CB, "unavailable definition");
-            ORE.emit([&]() {
-              return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
-                     << NV("Callee", Callee) << " will not be inlined into "
-                     << NV("Caller", CB->getCaller())
-                     << " because its definition is unavailable"
-                     << setIsVerbose();
-            });
-          }
-        }
-  }
-  if (Calls->empty())
-    return PreservedAnalyses::all();
-
-  // Capture updatable variable for the current SCC.
-  auto *C = &InitialC;
-
   // When inlining a callee produces new call sites, we want to keep track of
   // the fact that they were inlined from the callee. This allows us to avoid
   // infinite inlining in some obscure cases. To represent this, we use an
@@ -903,10 +832,6 @@
   // got simplified away.
   SmallSetVector<Function *, 4> InlinedCallees;
 
-  // Track the dead functions to delete once finished with inlining calls. We
-  // defer deleting these to make it easier to handle the call graph updates.
-  SmallVector<Function *, 4> DeadFunctions;
-
   // Loop forward over all of the calls.
   while (!Calls->empty()) {
     // We expect the calls to typically be batched with sequences of calls that
@@ -915,7 +840,7 @@
     // alone.
     Function &F = *Calls->front().first->getCaller();
     LazyCallGraph::Node &N = *CG.lookup(F);
-    if (CG.lookupSCC(N) != C) {
+    if (DoSCCInline && CG.lookupSCC(N) != C) {
       Calls->pop();
       continue;
     }
@@ -949,8 +874,7 @@
       // trigger infinite inlining, much like is prevented within the inliner
       // itself by the InlineHistory above, but spread across CGSCC iterations
       // and thus hidden from the full inline history.
-      if (CG.lookupSCC(*CG.lookup(Callee)) == C &&
-          UR.InlinedInternalEdges.count({&N, C})) {
+      if (DoSCCInline && CheckSCCSplit(Callee, C, N)) {
         LLVM_DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node "
                              "previously split out of this SCC by inlining: "
                           << F.getName() << " -> " << Callee.getName() << "\n");
@@ -1050,7 +974,110 @@
     // essentially do all of the same things as a function pass and we can
     // re-use the exact same logic for updating the call graph to reflect the
     // change.
+    if (DoSCCInline)
+      UpdateSCC(C, N, InlinedCallees);
+  }
+
+  return Changed;
+}
+
+PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
+                                   CGSCCAnalysisManager &AM, LazyCallGraph &CG,
+                                   CGSCCUpdateResult &UR) {
+  const auto &MAMProxy =
+      AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG);
+  bool Changed = false;
+
+  assert(InitialC.size() > 0 && "Cannot handle an empty SCC!");
+  Module &M = *InitialC.begin()->getFunction().getParent();
+  ProfileSummaryInfo *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(M);
+
+  FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG)
+          .getManager();
+
+  InlineAdvisor &Advisor = getAdvisor(MAMProxy, FAM, M);
+  Advisor.onPassEntry();
+
+  auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); });
+  // We use a single common worklist for calls across the entire SCC. We
+  // process these in-order and append new calls introduced during inlining to
+  // the end. The PriorityInlineOrder is optional here, in which the smaller
+  // callee would have a higher priority to inline.
+  //
+  // Note that this particular order of processing is actually critical to
+  // avoid very bad behaviors. Consider *highly connected* call graphs where
+  // each function contains a small amount of code and a couple of calls to
+  // other functions. Because the LLVM inliner is fundamentally a bottom-up
+  // inliner, it can handle gracefully the fact that these all appear to be
+  // reasonable inlining candidates as it will flatten things until they become
+  // too big to inline, and then move on and flatten another batch.
+  //
+  // However, when processing call edges *within* an SCC we cannot rely on this
+  // bottom-up behavior. As a consequence, with heavily connected *SCCs* of
+  // functions we can end up incrementally inlining N calls into each of
+  // N functions because each incremental inlining decision looks good and we
+  // don't have a topological ordering to prevent explosions.
+  //
+  // To compensate for this, we don't process transitive edges made immediate
+  // by inlining until we've done one pass of inlining across the entire SCC.
+  // Large, highly connected SCCs still lead to some amount of code bloat in
+  // this model, but it is uniformly spread across all the functions in the SCC
+  // and eventually they all become too large to inline, rather than
+  // incrementally making a single function grow in a super linear fashion.
+  std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>> Calls;
+  if (InlineEnablePriorityOrder)
+    Calls = std::make_unique<PriorityInlineOrder<InlineSizePriority>>();
+  else
+    Calls = std::make_unique<DefaultInlineOrder<std::pair<CallBase *, int>>>();
+  assert(Calls != nullptr && "Expected an initialized InlineOrder");
+
+  // Populate the initial list of calls in this SCC.
+  for (auto &N : InitialC) {
+    auto &ORE =
+        FAM.getResult<OptimizationRemarkEmitterAnalysis>(N.getFunction());
+    // We want to generally process call sites top-down in order for
+    // simplifications stemming from replacing the call with the returned value
+    // after inlining to be visible to subsequent inlining decisions.
+    // FIXME: Using instructions sequence is a really bad way to do this.
+    // Instead we should do an actual RPO walk of the function body.
+    for (Instruction &I : instructions(N.getFunction()))
+      if (auto *CB = dyn_cast<CallBase>(&I))
+        if (Function *Callee = CB->getCalledFunction()) {
+          if (!Callee->isDeclaration())
+            Calls->push({CB, -1});
+          else if (!isa<IntrinsicInst>(I)) {
+            using namespace ore;
+            setInlineRemark(*CB, "unavailable definition");
+            ORE.emit([&]() {
+              return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
+                     << NV("Callee", Callee) << " will not be inlined into "
+                     << NV("Caller", CB->getCaller())
+                     << " because its definition is unavailable"
+                     << setIsVerbose();
+            });
+          }
+        }
+  }
+  if (Calls->empty())
+    return PreservedAnalyses::all();
+
+  // Capture updatable variable for the current SCC.
+  auto *C = &InitialC;
+
+  // Track the dead functions to delete once finished with inlining calls. We
+  // defer deleting these to make it easier to handle the call graph updates.
+  SmallVector<Function *, 4> DeadFunctions;
+
+  auto CheckSCCSplit = [&](Function &Callee, LazyCallGraph::SCC *C,
+                           LazyCallGraph::Node &N) {
+    return CG.lookupSCC(*CG.lookup(Callee)) == C &&
+           UR.InlinedInternalEdges.count({&N, C});
+  };
+
+  auto UpdateSCC = [&](LazyCallGraph::SCC *C, LazyCallGraph::Node &N,
+                       SmallSetVector<Function *, 4> &InlinedCallees) {
     // Inside the update, we also update the FunctionAnalysisManager in the
     // proxy for this particular SCC. We do this as the SCC may have changed and
     // as we're going to mutate this particular function we want to make sure
@@ -1088,7 +1115,10 @@
       UR.InlinedInternalEdges.insert({&N, OldC});
     }
     InlinedCallees.clear();
-  }
+  };
+
+  Changed = doInline(Calls, Advisor, PSI, FAM, OnlyMandatory, true, C, CG,
+                     DeadFunctions, CheckSCCSplit, UpdateSCC);
 
   // Now that we've finished inlining all of the calls across this SCC, delete
   // all of the trivially dead functions, updating the call graph and the CGSCC
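
Note on the shape of the refactoring, for readers skimming the diff: the SCC-specific policy (skipping a call whose caller has left the current SCC, skipping edges previously split out of the SCC, and the post-inline call-graph update) is hoisted out of the inlining loop and injected back in as callbacks, gated behind a single DoSCCInline flag. The toy program below is a minimal, hedged sketch of that pattern in plain standard C++, not LLVM code: runInlineLoop, Call, and the lambdas are invented stand-ins for doInline, the worklist entries, CheckSCCSplit, and UpdateSCC, and std::function stands in for function_ref. A hypothetical non-SCC driver could pass the flag as false with no-op callbacks, which appears to be the kind of reuse this parameterization is preparing for.

// Standalone illustration only; none of these names exist in the patch.
#include <deque>
#include <functional>
#include <iostream>
#include <string>

struct Call {
  std::string Caller;
  std::string Callee;
};

// Mirrors the extracted driver: walks the worklist, consults ShouldSkip
// before each candidate, and runs AfterInline bookkeeping afterwards, but
// only when the SCC-aware mode is enabled.
static bool runInlineLoop(std::deque<Call> &Calls, bool SCCMode,
                          const std::function<bool(const Call &)> &ShouldSkip,
                          const std::function<void(const Call &)> &AfterInline) {
  bool Changed = false;
  while (!Calls.empty()) {
    Call C = Calls.front();
    Calls.pop_front();
    if (SCCMode && ShouldSkip(C))
      continue; // e.g. the caller is no longer in the SCC being processed
    std::cout << "inlining " << C.Callee << " into " << C.Caller << "\n";
    Changed = true;
    if (SCCMode)
      AfterInline(C); // e.g. update the call graph after the inline
  }
  return Changed;
}

int main() {
  std::deque<Call> Calls = {{"f", "g"}, {"f", "h"}, {"k", "g"}};

  // SCC-aware driver: inject a skip predicate and post-inline bookkeeping,
  // analogous to CheckSCCSplit and UpdateSCC in the patch.
  runInlineLoop(Calls, /*SCCMode=*/true,
                [](const Call &C) { return C.Caller == "k"; },
                [](const Call &C) {
                  std::cout << "  updated call graph after " << C.Callee << "\n";
                });

  // A non-SCC driver passes the flag as false with no-op callbacks.
  std::deque<Call> MoreCalls = {{"m", "n"}};
  runInlineLoop(MoreCalls, /*SCCMode=*/false,
                [](const Call &) { return false; }, [](const Call &) {});
  return 0;
}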