diff --git a/llvm/include/llvm/Transforms/IPO/Inliner.h b/llvm/include/llvm/Transforms/IPO/Inliner.h
--- a/llvm/include/llvm/Transforms/IPO/Inliner.h
+++ b/llvm/include/llvm/Transforms/IPO/Inliner.h
@@ -142,6 +142,29 @@
   CGSCCPassManager PM;
   ModulePassManager MPM;
 };
+
+/// The module-level inliner pass.
+class ModuleInlinerPass : public PassInfoMixin<ModuleInlinerPass> {
+public:
+  ModuleInlinerPass(InlineParams Params = getInlineParams(),
+                    bool OnlyMandatory = false,
+                    InliningAdvisorMode Mode = InliningAdvisorMode::Default,
+                    unsigned MaxDevirtIterations = 0);
+  ModuleInlinerPass(ModuleInlinerPass &&Arg) = default;
+
+  PreservedAnalyses run(Module &, ModuleAnalysisManager &);
+
+private:
+  InlineAdvisor &getAdvisor(const ModuleAnalysisManager &MAM,
+                            FunctionAnalysisManager &FAM, Module &M);
+  std::unique_ptr<InlineAdvisor> OwnedAdvisor;
+  const bool OnlyMandatory;
+
+  const InlineParams Params;
+  const InliningAdvisorMode Mode;
+  const unsigned MaxDevirtIterations;
+};
+
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_IPO_INLINER_H
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -115,6 +115,7 @@
 MODULE_PASS("dfsan", DataFlowSanitizerPass())
 MODULE_PASS("asan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/false, false, true, false))
 MODULE_PASS("msan-module", MemorySanitizerPass({}))
+MODULE_PASS("module-inline", ModuleInlinerPass())
 MODULE_PASS("tsan-module", ThreadSanitizerPass())
 MODULE_PASS("kasan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/true, false, true, false))
 MODULE_PASS("sancov-module", ModuleSanitizerCoveragePass())
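Note (illustration, not part of the patch): the PassRegistry.def entry above is what lets the `-passes='module-inline'` RUN lines below resolve to ModuleInlinerPass. A minimal sketch of driving the pass by that textual name from C++, assuming the standard new-pass-manager setup; the helper name runModuleInline is made up for this sketch.

  #include "llvm/IR/Module.h"
  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Support/Error.h"

  using namespace llvm;

  // Hypothetical helper: build a pipeline from the textual name registered in
  // PassRegistry.def and run it over a module.
  static bool runModuleInline(Module &M) {
    PassBuilder PB;
    LoopAnalysisManager LAM;
    FunctionAnalysisManager FAM;
    CGSCCAnalysisManager CGAM;
    ModuleAnalysisManager MAM;
    PB.registerModuleAnalyses(MAM);
    PB.registerCGSCCAnalyses(CGAM);
    PB.registerFunctionAnalyses(FAM);
    PB.registerLoopAnalyses(LAM);
    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

    ModulePassManager MPM;
    if (Error Err = PB.parsePassPipeline(MPM, "module-inline")) {
      consumeError(std::move(Err)); // A real caller would report this.
      return false;
    }
    MPM.run(M, MAM);
    return true;
  }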
diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp
--- a/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -1179,3 +1179,308 @@
   // The ModulePassManager has already taken care of invalidating analyses.
   return PreservedAnalyses::all();
 }
+
+ModuleInlinerPass::ModuleInlinerPass(InlineParams Params, bool OnlyMandatory,
+                                     InliningAdvisorMode Mode,
+                                     unsigned MaxDevirtIterations)
+    : OnlyMandatory(OnlyMandatory), Params(Params), Mode(Mode),
+      MaxDevirtIterations(MaxDevirtIterations) {
+  // Run the inliner first. The theory is that we are walking bottom-up and so
+  // the callees have already been fully optimized, and we want to inline them
+  // into the callers so that our optimizations can reflect that.
+  // For PreLinkThinLTO pass, we disable hot-caller heuristic for sample PGO
+  // because it makes profile annotation in the backend inaccurate.
+  // if (MandatoryFirst)
+  //   PM.addPass(InlinerPass(/*OnlyMandatory*/ true));
+  // PM.addPass(InlinerPass());
+}
+
+InlineAdvisor &ModuleInlinerPass::getAdvisor(const ModuleAnalysisManager &MAM,
+                                             FunctionAnalysisManager &FAM,
+                                             Module &M) {
+  if (OwnedAdvisor)
+    return *OwnedAdvisor;
+
+  auto *IAA = MAM.getCachedResult<InlineAdvisorAnalysis>(M);
+  if (!IAA) {
+    // It should still be possible to run the inliner as a stand-alone module
+    // pass, for test scenarios. In that case, we default to the
+    // DefaultInlineAdvisor, which doesn't need to keep state between pass
+    // runs. It also uses just the default InlineParams.
+    // In this case, we need to use the provided FAM, which is valid for the
+    // duration of the inliner pass, and thus the lifetime of the owned advisor.
+    // The one we would get from the MAM can be invalidated as a result of the
+    // inliner's activity.
+    OwnedAdvisor =
+        std::make_unique<DefaultInlineAdvisor>(M, FAM, getInlineParams());
+
+    // if (!CGSCCInlineReplayFile.empty())
+    //   OwnedAdvisor = std::make_unique<ReplayInlineAdvisor>(
+    //       M, FAM, M.getContext(), std::move(OwnedAdvisor),
+    //       CGSCCInlineReplayFile,
+    //       /*EmitRemarks=*/true);
+
+    return *OwnedAdvisor;
+  }
+  assert(IAA->getAdvisor() &&
+         "Expected a present InlineAdvisorAnalysis to also have an "
+         "InlineAdvisor initialized");
+  return *IAA->getAdvisor();
+}
+
+static bool isKnownLibFunction(Function &F, TargetLibraryInfo &TLI) {
+  LibFunc LF;
+
+  // Either this is a normal library function or a "vectorizable"
+  // function. Not using the VFDatabase here because this query
+  // is related only to libraries handled via the TLI.
+  return TLI.getLibFunc(F, LF) ||
+         TLI.isKnownVectorFunctionInLibrary(F.getName());
+}
+
+PreservedAnalyses ModuleInlinerPass::run(Module &M,
+                                         ModuleAnalysisManager &MAM) {
+  bool Changed = false;
+
+  ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M);
+
+  FunctionAnalysisManager &FAM =
+      MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+  auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+    return FAM.getResult<TargetLibraryAnalysis>(F);
+  };
+
+  InlineAdvisor &Advisor = getAdvisor(MAM, FAM, M);
+  Advisor.onPassEntry();
+
+  auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); });
+
+  // We use a single common worklist for calls across the entire module. We
+  // process these in-order and append new calls introduced during inlining to
+  // the end. The PriorityInlineOrder is optional here; with it, the smaller
+  // the callee, the higher its priority to be inlined.
+  //
+  // Note that this particular order of processing is actually critical to
+  // avoid very bad behaviors. Consider *highly connected* call graphs where
+  // each function contains a small amount of code and a couple of calls to
+  // other functions. Because the LLVM inliner is fundamentally a bottom-up
+  // inliner, it can handle gracefully the fact that these all appear to be
+  // reasonable inlining candidates as it will flatten things until they become
+  // too big to inline, and then move on and flatten another batch.
+  //
+  // However, when processing call edges *within* an SCC we cannot rely on this
+  // bottom-up behavior. As a consequence, with heavily connected *SCCs* of
+  // functions we can end up incrementally inlining N calls into each of
+  // N functions because each incremental inlining decision looks good and we
+  // don't have a topological ordering to prevent explosions.
+  //
+  // To compensate for this, we don't process transitive edges made immediate
+  // by inlining until we've done one pass of inlining across the entire
+  // module. Large, highly connected SCCs still lead to some code bloat in
+  // this model, but it is uniformly spread across all the functions in the
+  // SCC and eventually they all become too large to inline, rather than
+  // incrementally making a single function grow in a super linear fashion.
+  std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>> Calls;
+  if (InlineEnablePriorityOrder)
+    Calls = std::make_unique<PriorityInlineOrder<InlineSizePriority>>();
+  else
+    Calls = std::make_unique<DefaultInlineOrder<std::pair<CallBase *, int>>>();
+  assert(Calls != nullptr && "Expected an initialized InlineOrder");
+
+  // Populate the initial list of calls in this module.
+  for (Function &F : M) {
+    auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+    // We want to generally process call sites top-down in order for
+    // simplifications stemming from replacing the call with the returned value
+    // after inlining to be visible to subsequent inlining decisions.
+    // FIXME: Using the instruction sequence is a really bad way to do this.
+    // Instead we should do an actual RPO walk of the function body.
+    for (Instruction &I : instructions(F))
+      if (auto *CB = dyn_cast<CallBase>(&I))
+        if (Function *Callee = CB->getCalledFunction()) {
+          if (!Callee->isDeclaration())
+            Calls->push({CB, -1});
+          else if (!isa<IntrinsicInst>(I)) {
+            using namespace ore;
+            setInlineRemark(*CB, "unavailable definition");
+            ORE.emit([&]() {
+              return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
+                     << NV("Callee", Callee) << " will not be inlined into "
+                     << NV("Caller", CB->getCaller())
+                     << " because its definition is unavailable"
+                     << setIsVerbose();
+            });
+          }
+        }
+  }
+  if (Calls->empty())
+    return PreservedAnalyses::all();
+
+  // When inlining a callee produces new call sites, we want to keep track of
+  // the fact that they were inlined from the callee. This allows us to avoid
+  // infinite inlining in some obscure cases. To represent this, we use an
+  // index into the InlineHistory vector.
+  SmallVector<std::pair<Function *, int>, 16> InlineHistory;
+
+  // Track a set vector of inlined callees so that we can augment the caller
+  // with all of their edges in the call graph before pruning out the ones that
+  // got simplified away.
+  SmallSetVector<Function *, 4> InlinedCallees;
+
+  // Track the dead functions to delete once finished with inlining calls. We
+  // defer deleting these to make it easier to handle the call graph updates.
+  SmallVector<Function *, 4> DeadFunctions;
+
+  // Loop forward over all of the calls.
+  while (!Calls->empty()) {
+    // We expect the calls to typically be batched with sequences of calls that
+    // have the same caller, so we first set up some shared infrastructure for
+    // this caller. We also do any pruning we can at this layer on the caller
+    // alone.
+    Function &F = *Calls->front().first->getCaller();
+
+    LLVM_DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n"
+                      << "    Function size: " << F.getInstructionCount()
+                      << "\n");
+
+    auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+      return FAM.getResult<AssumptionAnalysis>(F);
+    };
+
+    // Now process as many calls as we have within this caller in the sequence.
+    // We bail out as soon as the caller has to change so we can update the
+    // call graph and prepare the context of that new caller.
+    bool DidInline = false;
+    while (!Calls->empty() && Calls->front().first->getCaller() == &F) {
+      auto P = Calls->pop();
+      CallBase *CB = P.first;
+      const int InlineHistoryID = P.second;
+      Function &Callee = *CB->getCalledFunction();
+
+      if (InlineHistoryID != -1 &&
+          inlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) {
+        setInlineRemark(*CB, "recursive");
+        continue;
+      }
+
+      auto Advice = Advisor.getAdvice(*CB, OnlyMandatory);
+      // Check whether we want to inline this callsite.
+      if (!Advice->isInliningRecommended()) {
+        Advice->recordUnattemptedInlining();
+        continue;
+      }
+
+      // Set up the data structure used to plumb customization into the
+      // `InlineFunction` routine.
+      InlineFunctionInfo IFI(
+          /*cg=*/nullptr, GetAssumptionCache, PSI,
+          &FAM.getResult<BlockFrequencyAnalysis>(*(CB->getCaller())),
+          &FAM.getResult<BlockFrequencyAnalysis>(Callee));
+
+      InlineResult IR =
+          InlineFunction(*CB, IFI, &FAM.getResult<AAManager>(*CB->getCaller()));
+      if (!IR.isSuccess()) {
+        Advice->recordUnsuccessfulInlining(IR);
+        continue;
+      }
+
+      DidInline = true;
+      InlinedCallees.insert(&Callee);
+      ++NumInlined;
+
+      LLVM_DEBUG(dbgs() << "    Size after inlining: "
+                        << F.getInstructionCount() << "\n");
+
+      // Add any new callsites to defined functions to the worklist.
+      if (!IFI.InlinedCallSites.empty()) {
+        int NewHistoryID = InlineHistory.size();
+        InlineHistory.push_back({&Callee, InlineHistoryID});
+
+        for (CallBase *ICB : reverse(IFI.InlinedCallSites)) {
+          Function *NewCallee = ICB->getCalledFunction();
+          if (!NewCallee) {
+            // Try to promote an indirect (virtual) call without waiting for
+            // the post-inline cleanup and the next DevirtSCCRepeatedPass
+            // iteration because the next iteration may not happen and we may
+            // miss inlining it.
+            if (tryPromoteCall(*ICB))
+              NewCallee = ICB->getCalledFunction();
+          }
+          if (NewCallee)
+            if (!NewCallee->isDeclaration())
+              Calls->push({ICB, NewHistoryID});
+        }
+      }
+
+      // Merge the attributes based on the inlining.
+      AttributeFuncs::mergeAttributesForInlining(F, Callee);
+
+      // For local functions, check whether this makes the callee trivially
+      // dead. In that case, we can drop the body of the function eagerly
+      // which may reduce the number of callers of other functions to one,
+      // changing inline cost thresholds.
+      bool CalleeWasDeleted = false;
+      if (Callee.hasLocalLinkage()) {
+        // To check this we also need to nuke any dead constant uses (perhaps
+        // made dead by this operation on other functions).
+        Callee.removeDeadConstantUsers();
+        // if (Callee.use_empty() && !CG.isLibFunction(Callee)) {
+        if (Callee.use_empty() && !isKnownLibFunction(Callee, GetTLI(Callee))) {
+          Calls->erase_if([&](const std::pair<CallBase *, int> &Call) {
+            return Call.first->getCaller() == &Callee;
+          });
+          // Clear the body and queue the function itself for deletion when we
+          // finish inlining and call graph updates.
+          // Note that after this point, it is an error to do anything other
+          // than use the callee's address or delete it.
+          Callee.dropAllReferences();
+          assert(!is_contained(DeadFunctions, &Callee) &&
+                 "Cannot cause a function to become dead twice!");
+          DeadFunctions.push_back(&Callee);
+          CalleeWasDeleted = true;
+        }
+      }
+      if (CalleeWasDeleted)
+        Advice->recordInliningWithCalleeDeleted();
+      else
+        Advice->recordInlining();
+    }
+
+    if (!DidInline)
+      continue;
+    Changed = true;
+
+    InlinedCallees.clear();
+  }
+
+  // Now that we've finished inlining all of the calls across this module,
+  // delete all of the trivially dead functions. Unlike the CGSCC inliner,
+  // there is no call graph to update here.
+  //
+  // Note that this walks a pointer set which has non-deterministic order but
+  // that is OK as all we do is delete things and add pointers to unordered
+  // sets.
+  for (Function *DeadF : DeadFunctions) {
+    // There is no call graph to update; just clear out any cached analyses
+    // for the function we are about to delete.
+    FAM.clear(*DeadF, DeadF->getName());
+
+    // And delete the actual function from the module.
+    // The Advisor may use Function pointers to efficiently index various
+    // internal maps, e.g. for memoization. Function cleanup passes like
+    // argument promotion create new functions. It is possible for a new
+    // function to be allocated at the address of a deleted function. We could
+    // index using names, but that's inefficient. Alternatively, we let the
+    // Advisor free the functions when it sees fit.
+    DeadF->getBasicBlockList().clear();
+    M.getFunctionList().remove(DeadF);
+
+    ++NumDeleted;
+  }
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  return PreservedAnalyses::none();
+}
\ No newline at end of file
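Note (concept sketch, not the LLVM InlineOrder API): run() above pulls call sites from an InlineOrder worklist; by default this is first-in, first-out, while the optional priority order hands out the call whose callee is smallest first. The snippet below only illustrates that ordering idea with plain C++ containers; the names Call and CalleeSize are invented for the sketch.

  #include <algorithm>
  #include <cstdio>
  #include <vector>

  struct Call {
    const char *Callee;
    unsigned CalleeSize; // stand-in for the callee's instruction count
  };

  int main() {
    std::vector<Call> Worklist = {{"big", 900}, {"small", 12}, {"mid", 120}};

    // Default order: discovery order (FIFO).
    for (const Call &C : Worklist)
      std::printf("fifo:     %s\n", C.Callee);

    // Priority order: smallest callee first, so cheap callees are flattened
    // before the worklist reaches the large ones.
    std::stable_sort(Worklist.begin(), Worklist.end(),
                     [](const Call &A, const Call &B) {
                       return A.CalleeSize < B.CalleeSize;
                     });
    for (const Call &C : Worklist)
      std::printf("priority: %s\n", C.Callee);
    return 0;
  }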
diff --git a/llvm/test/Transforms/Inline/callbr.ll b/llvm/test/Transforms/Inline/callbr.ll
--- a/llvm/test/Transforms/Inline/callbr.ll
+++ b/llvm/test/Transforms/Inline/callbr.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -inline -S < %s | FileCheck %s
 ; RUN: opt -passes='cgscc(inline)' -S < %s | FileCheck %s
+; RUN: opt -passes='module-inline' -S < %s | FileCheck %s
 
 define dso_local i32 @main() #0 {
   %1 = alloca i32, align 4
diff --git a/llvm/test/Transforms/Inline/casts.ll b/llvm/test/Transforms/Inline/casts.ll
--- a/llvm/test/Transforms/Inline/casts.ll
+++ b/llvm/test/Transforms/Inline/casts.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -inline -S | FileCheck %s
 ; RUN: opt < %s -passes='cgscc(inline)' -S | FileCheck %s
+; RUN: opt < %s -passes='module-inline' -S | FileCheck %s
 
 define i32 @testByte(i8 %X) {
 entry:
diff --git a/llvm/test/Transforms/Inline/comdat-ipo.ll b/llvm/test/Transforms/Inline/comdat-ipo.ll
--- a/llvm/test/Transforms/Inline/comdat-ipo.ll
+++ b/llvm/test/Transforms/Inline/comdat-ipo.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -inline -S < %s | FileCheck %s
 ; RUN: opt -passes='cgscc(inline)' -S < %s | FileCheck %s
+; RUN: opt -passes='module-inline' -S < %s | FileCheck %s
 
 define i32 @caller() {
 ; CHECK-LABEL: @caller(
diff --git a/llvm/test/Transforms/Inline/crash-lifetime-marker.ll b/llvm/test/Transforms/Inline/crash-lifetime-marker.ll
--- a/llvm/test/Transforms/Inline/crash-lifetime-marker.ll
+++ b/llvm/test/Transforms/Inline/crash-lifetime-marker.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -inline -S | FileCheck %s
 ; RUN: opt < %s -passes='cgscc(inline)' -S | FileCheck %s
+; RUN: opt < %s -passes='module-inline' -S | FileCheck %s
 
 ; InlineFunction would assert inside the loop that leaves lifetime markers if
 ; there was an zero-sized AllocaInst. Check that it doesn't assert and doesn't
diff --git a/llvm/test/Transforms/Inline/frameescape.ll b/llvm/test/Transforms/Inline/frameescape.ll
--- a/llvm/test/Transforms/Inline/frameescape.ll
+++ b/llvm/test/Transforms/Inline/frameescape.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -inline -S < %s | FileCheck %s
 ; RUN: opt -passes='cgscc(inline)' -S < %s | FileCheck %s
+; RUN: opt -passes='module-inline' -S < %s | FileCheck %s
 
 ; PR23216: We can't inline functions using llvm.localescape.
diff --git a/llvm/test/Transforms/Inline/inline-assume.ll b/llvm/test/Transforms/Inline/inline-assume.ll
--- a/llvm/test/Transforms/Inline/inline-assume.ll
+++ b/llvm/test/Transforms/Inline/inline-assume.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -inline -S -o - < %s | FileCheck %s
 ; RUN: opt -passes='cgscc(inline)' -S < %s | FileCheck %s
+; RUN: opt -passes='module-inline' -S < %s | FileCheck %s
 
 %0 = type opaque
 %struct.Foo = type { i32, %0* }
diff --git a/llvm/test/Transforms/Inline/inline-constexpr-addrspacecast-argument.ll b/llvm/test/Transforms/Inline/inline-constexpr-addrspacecast-argument.ll
--- a/llvm/test/Transforms/Inline/inline-constexpr-addrspacecast-argument.ll
+++ b/llvm/test/Transforms/Inline/inline-constexpr-addrspacecast-argument.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -S -inline < %s | FileCheck %s
 ; RUN: opt -S -passes='cgscc(inline)' < %s | FileCheck %s
+; RUN: opt -S -passes='module-inline' < %s | FileCheck %s
 
 target datalayout = "e-p3:32:32-p4:64:64-n32"
diff --git a/llvm/test/Transforms/Inline/inline-fast-math-flags.ll b/llvm/test/Transforms/Inline/inline-fast-math-flags.ll
--- a/llvm/test/Transforms/Inline/inline-fast-math-flags.ll
+++ b/llvm/test/Transforms/Inline/inline-fast-math-flags.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -S -inline -inline-threshold=20 | FileCheck %s
 ; RUN: opt < %s -S -passes='cgscc(inline)' -inline-threshold=20 | FileCheck %s
+; RUN: opt < %s -S -passes='module-inline' -inline-threshold=20 | FileCheck %s
 ; Check that we don't drop FastMathFlag when estimating inlining profitability.
 ;
 ; In this test we should inline 'foo' to 'boo', because it'll fold to a
diff --git a/llvm/test/Transforms/Inline/inline-vla.ll b/llvm/test/Transforms/Inline/inline-vla.ll
--- a/llvm/test/Transforms/Inline/inline-vla.ll
+++ b/llvm/test/Transforms/Inline/inline-vla.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -S -inline %s -o - | FileCheck %s
 ; RUN: opt -S -passes='cgscc(inline)' %s -o - | FileCheck %s
+; RUN: opt -S -passes='module-inline' %s -o - | FileCheck %s
 
 ; Check that memcpy2 is completely inlined away.
 ; CHECK-NOT: memcpy2
diff --git a/llvm/test/Transforms/Inline/invoke-cleanup.ll b/llvm/test/Transforms/Inline/invoke-cleanup.ll
--- a/llvm/test/Transforms/Inline/invoke-cleanup.ll
+++ b/llvm/test/Transforms/Inline/invoke-cleanup.ll
@@ -1,5 +1,6 @@
 ; RUN: opt %s -inline -S | FileCheck %s
 ; RUN: opt %s -passes='cgscc(inline)' -S | FileCheck %s
+; RUN: opt %s -passes='module-inline' -S | FileCheck %s
 
 declare void @external_func()
diff --git a/llvm/test/Transforms/Inline/invoke-combine-clauses.ll b/llvm/test/Transforms/Inline/invoke-combine-clauses.ll
--- a/llvm/test/Transforms/Inline/invoke-combine-clauses.ll
+++ b/llvm/test/Transforms/Inline/invoke-combine-clauses.ll
@@ -1,4 +1,5 @@
 ; RUN: opt %s -passes='cgscc(inline)' -S | FileCheck %s
+; RUN: opt %s -passes='module-inline' -S | FileCheck %s
 
 declare void @external_func()
 declare void @abort()
diff --git a/llvm/test/Transforms/Inline/invoke_test-1.ll b/llvm/test/Transforms/Inline/invoke_test-1.ll
--- a/llvm/test/Transforms/Inline/invoke_test-1.ll
+++ b/llvm/test/Transforms/Inline/invoke_test-1.ll
@@ -3,6 +3,7 @@
 
 ; RUN: opt < %s -inline -S | FileCheck %s
 ; RUN: opt < %s -passes='cgscc(inline)' -S | FileCheck %s
+; RUN: opt < %s -passes='module-inline' -S | FileCheck %s
 
 declare void @might_throw()
diff --git a/llvm/test/Transforms/Inline/invoke_test-3.ll b/llvm/test/Transforms/Inline/invoke_test-3.ll
--- a/llvm/test/Transforms/Inline/invoke_test-3.ll
+++ b/llvm/test/Transforms/Inline/invoke_test-3.ll
@@ -3,6 +3,7 @@
 
 ; RUN: opt < %s -inline -S | FileCheck %s
 ; RUN: opt < %s -passes='cgscc(inline)' -S | FileCheck %s
+; RUN: opt < %s -passes='module-inline' -S | FileCheck %s
 
 declare void @might_throw()
diff --git a/llvm/test/Transforms/Inline/nested-inline.ll b/llvm/test/Transforms/Inline/nested-inline.ll
--- a/llvm/test/Transforms/Inline/nested-inline.ll
+++ b/llvm/test/Transforms/Inline/nested-inline.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -inline -S | FileCheck %s
 ; RUN: opt < %s -passes='cgscc(inline)' -S | FileCheck %s
+; RUN: opt < %s -passes='module-inline' -S | FileCheck %s
 ; Test that bar and bar2 are both inlined throughout and removed.
 @A = weak global i32 0		; <i32*> [#uses=1]
 @B = weak global i32 0		; <i32*> [#uses=1]
diff --git a/llvm/test/Transforms/Inline/nonnull.ll b/llvm/test/Transforms/Inline/nonnull.ll
--- a/llvm/test/Transforms/Inline/nonnull.ll
+++ b/llvm/test/Transforms/Inline/nonnull.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -S -inline %s | FileCheck %s
 ; RUN: opt -S -passes='cgscc(inline)' %s | FileCheck %s
+; RUN: opt -S -passes='module-inline' %s | FileCheck %s
 
 declare void @foo()
 declare void @bar()
diff --git a/llvm/test/Transforms/Inline/pr21206.ll b/llvm/test/Transforms/Inline/pr21206.ll
--- a/llvm/test/Transforms/Inline/pr21206.ll
+++ b/llvm/test/Transforms/Inline/pr21206.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -inline -S | FileCheck %s
 ; RUN: opt < %s -passes='cgscc(inline)' -S | FileCheck %s
+; RUN: opt < %s -passes='module-inline' -S | FileCheck %s
 
 $c = comdat any
 ; CHECK: $c = comdat any
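Note (illustration, not part of the patch): the RUN lines above exercise the pass through opt; a host tool could instead construct it directly with non-default parameters. A minimal sketch, assuming the same analysis-manager boilerplate as the earlier sketch; the helper name inlineWholeModule and the 275 threshold are made up for the example.

  #include "llvm/Analysis/InlineCost.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Transforms/IPO/Inliner.h"

  using namespace llvm;

  // Hypothetical helper: run module-wide inlining once with a custom threshold.
  static PreservedAnalyses inlineWholeModule(Module &M) {
    PassBuilder PB;
    LoopAnalysisManager LAM;
    FunctionAnalysisManager FAM;
    CGSCCAnalysisManager CGAM;
    ModuleAnalysisManager MAM;
    PB.registerModuleAnalyses(MAM);
    PB.registerCGSCCAnalyses(CGAM);
    PB.registerFunctionAnalyses(FAM);
    PB.registerLoopAnalyses(LAM);
    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

    ModulePassManager MPM;
    MPM.addPass(ModuleInlinerPass(getInlineParams(/*Threshold=*/275),
                                  /*OnlyMandatory=*/false));
    return MPM.run(M, MAM);
  }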