Index: include/llvm/Analysis/CGSCCPassManager.h
===================================================================
--- include/llvm/Analysis/CGSCCPassManager.h
+++ include/llvm/Analysis/CGSCCPassManager.h
@@ -331,7 +331,9 @@
           InvalidSCCSet, nullptr, nullptr};
 
       PreservedAnalyses PA = PreservedAnalyses::all();
-      for (LazyCallGraph::RefSCC &InitialRC : CG.postorder_ref_sccs()) {
+      for (auto RCI = CG.postorder_ref_scc_begin(),
+                RCE = CG.postorder_ref_scc_end();
+           RCI != RCE;) {
         assert(RCWorklist.empty() &&
                "Should always start with an empty RefSCC worklist");
         // The postorder_ref_sccs range we are walking is lazily constructed, so
@@ -342,7 +344,10 @@
         // to update as the program is simplified and allows us to have greater
         // cache locality as forming a RefSCC touches all the parts of all the
         // functions within that RefSCC.
-        RCWorklist.insert(&InitialRC);
+        //
+        // We also eagerly increment the iterator to the next position because
+        // the CGSCC passes below may delete the current RefSCC.
+        RCWorklist.insert(&*RCI++);
 
         do {
           LazyCallGraph::RefSCC *RC = RCWorklist.pop_back_val();
@@ -419,6 +424,10 @@
               dbgs() << "Re-running SCC passes after a refinement of the "
                         "current SCC: "
                      << *UR.UpdatedC << "\n";
+
+            // Note that both `C` and `RC` may at this point refer to deleted,
+            // invalid SCCs and RefSCCs, respectively. But we will short-circuit
+            // the processing when we check them in the loop above.
           } while (UR.UpdatedC);
         } while (!CWorklist.empty());
Index: include/llvm/Transforms/IPO/Inliner.h
===================================================================
--- include/llvm/Transforms/IPO/Inliner.h
+++ include/llvm/Transforms/IPO/Inliner.h
@@ -1,4 +1,4 @@
-//===- InlinerPass.h - Code common to all inliners --------------*- C++ -*-===//
+//===- Inliner.h - Inliner pass and infrastructure --------------*- C++ -*-===//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -6,19 +6,14 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This file defines a simple policy-based bottom-up inliner. This file
-// implements all of the boring mechanics of the bottom-up inlining, while the
-// subclass determines WHAT to inline, which is the much more interesting
-// component.
-//
-//===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_IPO_INLINERPASS_H
-#define LLVM_TRANSFORMS_IPO_INLINERPASS_H
+#ifndef LLVM_TRANSFORMS_IPO_INLINER_H
+#define LLVM_TRANSFORMS_IPO_INLINER_H
 
+#include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h"
 
@@ -29,13 +24,13 @@
 class InlineCost;
 class OptimizationRemarkEmitter;
 class ProfileSummaryInfo;
-template <class PtrType, unsigned SmallSize> class SmallPtrSet;
 
 /// This class contains all of the helper code which is used to perform the
-/// inlining operations that do not depend on the policy.
-struct Inliner : public CallGraphSCCPass {
-  explicit Inliner(char &ID);
-  explicit Inliner(char &ID, bool InsertLifetime);
+/// inlining operations that do not depend on the policy. It contains the core
+/// bottom-up inlining infrastructure that specific inliner passes use.
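+///
+/// As a hedged illustration (this sketch is editorial, not part of the
+/// patch, and `MyInliner` is a hypothetical subclass), a derived legacy
+/// inliner typically supplies only the policy by overriding the
+/// getInlineCost hook:
+///
+///   struct MyInliner : public LegacyInlinerBase {
+///     static char ID;
+///     MyInliner() : LegacyInlinerBase(ID) {}
+///
+///     // Policy hook: decide what to inline. Everything else (the SCC walk,
+///     // call graph updates, dead function removal) lives in the base class.
+///     InlineCost getInlineCost(CallSite CS) override {
+///       return InlineCost::getAlways();
+///     }
+///   };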
+struct LegacyInlinerBase : public CallGraphSCCPass {
+  explicit LegacyInlinerBase(char &ID);
+  explicit LegacyInlinerBase(char &ID, bool InsertLifetime);
 
   /// For this class, we declare that we require and preserve the call graph.
   /// If the derived class implements this method, it should always explicitly
@@ -82,6 +77,32 @@
   ImportedFunctionsInliningStatistics ImportedFunctionsStats;
 };
 
+/// The inliner pass for the new pass manager.
+///
+/// This pass wires together the inlining utilities and the inline cost
+/// analysis into a CGSCC pass. It considers every call in every function in
+/// the SCC and tries to inline if profitable. It can be tuned with a number of
+/// parameters to control what cost model is used and what tradeoffs are made
+/// when making the decision.
+///
+/// It should be noted that the legacy inliners do considerably more than this
+/// inliner pass does. They provide logic for manually merging allocas, and
+/// doing considerable DCE, including the DCE of dead functions. This pass
+/// makes every attempt to be simpler. DCE of functions requires complex
+/// reasoning about comdat groups, etc. Instead, it is expected that other more
+/// focused passes be composed to achieve the same end result.
+class InlinerPass : public PassInfoMixin<InlinerPass> {
+public:
+  InlinerPass(InlineParams Params = getInlineParams())
+      : Params(std::move(Params)) {}
+
+  PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM,
+                        LazyCallGraph &CG, CGSCCUpdateResult &UR);
+
+private:
+  InlineParams Params;
+};
+
 } // End llvm namespace
 
 #endif
Index: include/llvm/Transforms/Utils/Cloning.h
===================================================================
--- include/llvm/Transforms/Utils/Cloning.h
+++ include/llvm/Transforms/Utils/Cloning.h
@@ -194,9 +194,17 @@
   /// inlined from the callee. This is only filled in if CG is non-null.
   SmallVector<WeakVH, 8> InlinedCalls;
 
+  /// All of the new call sites inlined into the caller.
+  ///
+  /// `InlineFunction` fills this in by scanning the inlined instructions, but
+  /// only if CG is null. If CG is non-null, the value-handle vector
+  /// `InlinedCalls` above is used instead.
+  SmallVector<CallSite, 8> InlinedCallSites;
+
   void reset() {
     StaticAllocas.clear();
     InlinedCalls.clear();
+    InlinedCallSites.clear();
   }
 };
 
@@ -210,6 +218,10 @@
 /// exists in the instruction stream. Similarly this will inline a recursive
 /// function by one level.
 ///
+/// Note that while this routine is allowed to clean up and optimize the
+/// *inlined* code to minimize the actual inserted code, it must not delete
+/// code in the caller as users of this routine may have pointers to
+/// instructions in the caller that need to remain stable.
 bool InlineFunction(CallInst *C, InlineFunctionInfo &IFI,
                     AAResults *CalleeAAR = nullptr, bool InsertLifetime = true);
 bool InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI,
Index: lib/Analysis/InlineCost.cpp
===================================================================
--- lib/Analysis/InlineCost.cpp
+++ lib/Analysis/InlineCost.cpp
@@ -638,7 +638,7 @@
   bool HotCallsite = false;
   uint64_t TotalWeight;
-  if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) &&
+  if (PSI && CS.getInstruction()->extractProfTotalWeight(TotalWeight) &&
       PSI->isHotCount(TotalWeight)) {
     HotCallsite = true;
   }
@@ -647,14 +647,14 @@
   // when it would increase the threshold and the caller does not need to
   // minimize its size.
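+  // (Editorial note, hedged: with this change PSI may legitimately be null,
+  // e.g. when the new pass manager's inliner finds no cached
+  // ProfileSummaryAnalysis result, so every PSI use in this function is now
+  // guarded.)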
   bool InlineHint = Callee.hasFnAttribute(Attribute::InlineHint) ||
-                    PSI->isFunctionEntryHot(&Callee);
+                    (PSI && PSI->isFunctionEntryHot(&Callee));
   if (InlineHint && !Caller->optForMinSize())
     Threshold = MaxIfValid(Threshold, Params.HintThreshold);
 
   if (HotCallsite && !Caller->optForMinSize())
     Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold);
 
-  bool ColdCallee = PSI->isFunctionEntryCold(&Callee);
+  bool ColdCallee = PSI && PSI->isFunctionEntryCold(&Callee);
   // For cold callees, use the ColdThreshold knob if it is available and reduces
   // the threshold.
   if (ColdCallee)
Index: lib/Passes/PassBuilder.cpp
===================================================================
--- lib/Passes/PassBuilder.cpp
+++ lib/Passes/PassBuilder.cpp
@@ -72,6 +72,7 @@
 #include "llvm/Transforms/IPO/GlobalOpt.h"
 #include "llvm/Transforms/IPO/GlobalSplit.h"
 #include "llvm/Transforms/IPO/InferFunctionAttrs.h"
+#include "llvm/Transforms/IPO/Inliner.h"
 #include "llvm/Transforms/IPO/Internalize.h"
 #include "llvm/Transforms/IPO/LowerTypeTests.h"
 #include "llvm/Transforms/IPO/PartialInlining.h"
Index: lib/Passes/PassRegistry.def
===================================================================
--- lib/Passes/PassRegistry.def
+++ lib/Passes/PassRegistry.def
@@ -87,6 +87,7 @@
 #endif
 CGSCC_PASS("invalidate", InvalidateAllAnalysesPass())
 CGSCC_PASS("function-attrs", PostOrderFunctionAttrsPass())
+CGSCC_PASS("inline", InlinerPass())
 CGSCC_PASS("no-op-cgscc", NoOpCGSCCPass())
 #undef CGSCC_PASS
Index: lib/Transforms/IPO/AlwaysInliner.cpp
===================================================================
--- lib/Transforms/IPO/AlwaysInliner.cpp
+++ lib/Transforms/IPO/AlwaysInliner.cpp
@@ -26,7 +26,8 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
-#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Inliner.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 
 using namespace llvm;
@@ -62,14 +63,15 @@
 ///
 /// Unlike the \c AlwaysInlinerPass, this uses the more heavyweight \c Inliner
 /// base class to provide several facilities such as array alloca merging.
-class AlwaysInlinerLegacyPass : public Inliner {
+class AlwaysInlinerLegacyPass : public LegacyInlinerBase {
 public:
-  AlwaysInlinerLegacyPass() : Inliner(ID, /*InsertLifetime*/ true) {
+  AlwaysInlinerLegacyPass() : LegacyInlinerBase(ID, /*InsertLifetime*/ true) {
     initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
   }
 
-  AlwaysInlinerLegacyPass(bool InsertLifetime) : Inliner(ID, InsertLifetime) {
+  AlwaysInlinerLegacyPass(bool InsertLifetime)
+      : LegacyInlinerBase(ID, InsertLifetime) {
     initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
   }
Index: lib/Transforms/IPO/InlineSimple.cpp
===================================================================
--- lib/Transforms/IPO/InlineSimple.cpp
+++ lib/Transforms/IPO/InlineSimple.cpp
@@ -25,7 +25,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/Transforms/IPO/Inliner.h"
 
 using namespace llvm;
 
@@ -38,16 +38,17 @@
 /// The common implementation of the inlining logic is shared between this
 /// inliner pass and the always inliner pass. The two passes use different cost
 /// analyses to determine when to inline.
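+///
+/// (Editorial sketch: SimpleInliner overrides the getInlineCost hook with the
+/// InlineParams-driven cost analysis, while AlwaysInlinerLegacyPass's
+/// override effectively returns InlineCost::getAlways() for viable callees
+/// marked alwaysinline; the shared mechanics live in LegacyInlinerBase.)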
-class SimpleInliner : public Inliner {
+class SimpleInliner : public LegacyInlinerBase {
 
   InlineParams Params;
 
 public:
-  SimpleInliner() : Inliner(ID), Params(llvm::getInlineParams()) {
+  SimpleInliner() : LegacyInlinerBase(ID), Params(llvm::getInlineParams()) {
     initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
   }
 
-  explicit SimpleInliner(InlineParams Params) : Inliner(ID), Params(Params) {
+  explicit SimpleInliner(InlineParams Params)
+      : LegacyInlinerBase(ID), Params(Params) {
     initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
   }
 
@@ -101,10 +102,10 @@
 bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) {
   TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
-  return Inliner::runOnSCC(SCC);
+  return LegacyInlinerBase::runOnSCC(SCC);
 }
 
 void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<TargetTransformInfoWrapperPass>();
-  Inliner::getAnalysisUsage(AU);
+  LegacyInlinerBase::getAnalysisUsage(AU);
 }
Index: lib/Transforms/IPO/Inliner.cpp
===================================================================
--- lib/Transforms/IPO/Inliner.cpp
+++ lib/Transforms/IPO/Inliner.cpp
@@ -13,6 +13,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Transforms/IPO/Inliner.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -26,12 +27,12 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO/InlinerPass.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
@@ -76,15 +77,16 @@
     cl::Hidden, cl::desc("Enable inliner stats for imported functions"));
 } // namespace
 
-Inliner::Inliner(char &ID) : CallGraphSCCPass(ID), InsertLifetime(true) {}
+LegacyInlinerBase::LegacyInlinerBase(char &ID)
+    : CallGraphSCCPass(ID), InsertLifetime(true) {}
 
-Inliner::Inliner(char &ID, bool InsertLifetime)
+LegacyInlinerBase::LegacyInlinerBase(char &ID, bool InsertLifetime)
     : CallGraphSCCPass(ID), InsertLifetime(InsertLifetime) {}
 
 /// For this class, we declare that we require and preserve the call graph.
 /// If the derived class implements this method, it should
 /// always explicitly call the implementation here.
-void Inliner::getAnalysisUsage(AnalysisUsage &AU) const {
+void LegacyInlinerBase::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<AssumptionCacheTracker>();
   AU.addRequired<ProfileSummaryInfoWrapperPass>();
   AU.addRequired<TargetLibraryInfoWrapperPass>();
@@ -409,13 +411,13 @@
   return false;
 }
 
-bool Inliner::doInitialization(CallGraph &CG) {
+bool LegacyInlinerBase::doInitialization(CallGraph &CG) {
   if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
     ImportedFunctionsStats.setModuleInfo(CG.getModule());
   return false; // No changes to CallGraph.
 }
 
-bool Inliner::runOnSCC(CallGraphSCC &SCC) {
+bool LegacyInlinerBase::runOnSCC(CallGraphSCC &SCC) {
   if (skipSCC(SCC))
     return false;
   return inlineCalls(SCC);
@@ -630,7 +632,7 @@
   return Changed;
 }
 
-bool Inliner::inlineCalls(CallGraphSCC &SCC) {
+bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) {
   CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
   ACT = &getAnalysis<AssumptionCacheTracker>();
   PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
@@ -655,7 +657,7 @@
 
 /// Remove now-dead linkonce functions at the end of
 /// processing to avoid breaking the SCC traversal.
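+/// (Editorial note: deletion is deferred to doFinalization because erasing
+/// functions while the pass manager is still iterating over the CallGraphSCC
+/// would invalidate the traversal; collecting dead linkonce functions and
+/// removing them after the walk keeps the SCC iteration stable.)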
-bool Inliner::doFinalization(CallGraph &CG) {
+bool LegacyInlinerBase::doFinalization(CallGraph &CG) {
   if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
     ImportedFunctionsStats.dump(InlinerFunctionImportStats ==
                                 InlinerFunctionImportStatsOpts::Verbose);
@@ -663,7 +665,8 @@
 }
 
 /// Remove dead functions that are not included in DNR (Do Not Remove) list.
-bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {
+bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG,
+                                            bool AlwaysInlineOnly) {
   SmallVector<CallGraphNode *, 16> FunctionsToRemove;
   SmallVector<Function *, 16> DeadFunctionsInComdats;
   SmallDenseMap<const Comdat *, int, 32> ComdatEntriesAlive;
@@ -765,3 +768,175 @@
   }
   return true;
 }
+
+PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
+                                   CGSCCAnalysisManager &AM, LazyCallGraph &CG,
+                                   CGSCCUpdateResult &UR) {
+  FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG)
+          .getManager();
+  const ModuleAnalysisManager &MAM =
+      AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG).getManager();
+  bool Changed = false;
+
+  assert(InitialC.size() > 0 && "Cannot handle an empty SCC!");
+  Module &M = *InitialC.begin()->getFunction().getParent();
+  ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M);
+
+  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+      [&](Function &F) -> AssumptionCache & {
+    return FAM.getResult<AssumptionAnalysis>(F);
+  };
+
+  // Set up the data structure used to plumb customization into the
+  // `InlineFunction` routine.
+  //
+  // FIXME: We should make the `GetAssumptionCache` below a passed-by-value
+  // lambda that is converted to a function_ref instead of the address of
+  // a std::function.
+  InlineFunctionInfo IFI(/*cg=*/nullptr, &GetAssumptionCache);
+
+  auto GetInlineCost = [&](CallSite CS) {
+    Function &Callee = *CS.getCalledFunction();
+    auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee);
+    return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, PSI);
+  };
+
+  // We use a worklist of nodes to process so that we can handle the case where
+  // the SCC structure changes and some nodes are no longer part of the current
+  // SCC. We also need to use an updatable pointer for the SCC as a
+  // consequence.
+  SmallVector<LazyCallGraph::Node *, 16> Nodes;
+  for (auto &N : InitialC)
+    Nodes.push_back(&N);
+  auto *C = &InitialC;
+  auto *RC = &C->getOuterRefSCC();
+
+  // We also use a secondary worklist of calls to allow quickly continuing to
+  // inline through newly inlined call sites where possible.
+  SmallVector<CallSite, 16> Calls;
+
+  // Track a set vector of inlined callees so that we can augment the caller
+  // with all of their edges in the call graph before pruning out the ones that
+  // got simplified away.
+  SmallSetVector<Function *, 4> InlinedCallees;
+
+  // Track a set of dead functions to delete once finished with inlining calls.
+  // We defer deleting these to make it easier to handle the call graph
+  // updates.
+  SmallPtrSet<Function *, 16> DeadFunctions;
+
+  do {
+    auto &N = *Nodes.pop_back_val();
+    if (CG.lookupSCC(N) != C)
+      continue;
+    Function &F = N.getFunction();
+    if (F.hasFnAttribute(Attribute::OptimizeNone))
+      continue;
+
+    // Get the remarks emission analysis for the caller.
+    auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+    // We want to generally process call sites top-down in order for
+    // simplifications stemming from replacing the call with the returned value
+    // after inlining to be visible to subsequent inlining decisions. So we
+    // walk the function backwards and then process the back of the vector.
+    // FIXME: Using reverse is a really bad way to do this. Instead we should
+    // do an actual PO walk of the function body.
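+    //
+    // A hedged sketch of what that FIXME suggests (editorial, not part of
+    // the patch): walk the CFG in RPO so call sites are gathered in a true
+    // top-down order, e.g.:
+    //
+    //   ReversePostOrderTraversal<Function *> RPOT(&F);
+    //   for (BasicBlock *BB : RPOT)
+    //     for (Instruction &I : *BB)
+    //       if (auto CS = CallSite(&I))
+    //         ...collect CS...
+    //
+    // with the worklist then consumed from the front (or the collection
+    // reversed) to preserve the top-down processing described above.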
+    for (Instruction &I : reverse(instructions(F)))
+      if (auto CS = CallSite(&I))
+        if (Function *Callee = CS.getCalledFunction())
+          if (!Callee->isDeclaration())
+            Calls.push_back(CS);
+
+    bool DidInline = false;
+    while (!Calls.empty()) {
+      CallSite CS = Calls.pop_back_val();
+      Function &Callee = *CS.getCalledFunction();
+
+      // Check whether we want to inline this callsite.
+      if (!shouldInline(CS, GetInlineCost, ORE))
+        continue;
+
+      if (!InlineFunction(CS, IFI))
+        continue;
+      DidInline = true;
+      InlinedCallees.insert(&Callee);
+
+      // Add any new call sites to defined functions to the worklist.
+      for (CallSite &CS : reverse(IFI.InlinedCallSites))
+        if (Function *NewCallee = CS.getCalledFunction())
+          if (!NewCallee->isDeclaration())
+            Calls.push_back(CS);
+
+      // For local functions, check whether this makes the callee trivially
+      // dead. In that case, we can drop the body of the function eagerly
+      // which may reduce the number of callers of other functions to one,
+      // changing inline cost thresholds.
+      if (Callee.hasLocalLinkage()) {
+        // To check this we also need to nuke any dead constant uses (perhaps
+        // made dead by this operation on other functions).
+        Callee.removeDeadConstantUsers();
+        if (Callee.use_empty())
+          // Note that after this point, it is an error to do anything other
+          // than use the callee's address or delete it.
+          Callee.dropAllReferences();
+      }
+    }
+
+    if (!DidInline)
+      continue;
+    Changed = true;
+
+    // Add all the inlined callees' edges to the caller. These are by
+    // definition trivial edges as we already had a transitive call edge to the
+    // callee.
+    for (Function *InlinedCallee : InlinedCallees) {
+      LazyCallGraph::Node &CalleeN = *CG.lookup(*InlinedCallee);
+      for (LazyCallGraph::Edge &E : CalleeN)
+        if (E.isCall())
+          RC->insertTrivialCallEdge(N, *E.getNode());
+        else
+          RC->insertTrivialRefEdge(N, *E.getNode());
+    }
+
+    // At this point, since we have made changes we have at least removed
+    // a call instruction. However, in the process we do some incremental
+    // simplification of the surrounding code. This simplification can
+    // essentially do all of the same things as a function pass and we can
+    // re-use the exact same logic for updating the call graph to reflect the
+    // change.
+    C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR);
+    RC = &C->getOuterRefSCC();
+
+    // And remember all of the inlined callees that ended up dead after
+    // inlining.
+    for (Function *InlinedCallee : InlinedCallees)
+      if (InlinedCallee->hasLocalLinkage() && InlinedCallee->use_empty())
+        DeadFunctions.insert(InlinedCallee);
+    InlinedCallees.clear();
+  } while (!Nodes.empty());
+
+  // Now that we've finished inlining all of the calls across this SCC, delete
+  // all of the trivially dead functions, updating the call graph and the CGSCC
+  // pass manager in the process.
+  //
+  // Note that this walks a pointer set which has non-deterministic order but
+  // that is OK as all we do is delete things and add pointers to unordered
+  // sets.
+  for (Function *DeadF : DeadFunctions) {
+    // Get the necessary information out of the call graph and nuke the
+    // function there.
+    auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF));
+    auto &DeadRC = DeadC.getOuterRefSCC();
+    CG.removeDeadFunction(*DeadF);
+
+    // Mark the relevant parts of the call graph as invalid so we don't visit
+    // them.
+    UR.InvalidatedSCCs.insert(&DeadC);
+    UR.InvalidatedRefSCCs.insert(&DeadRC);
+
+    // And delete the actual function from the module.
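+    // (Editorial note: erase() unlinks the function from the module's list
+    // and destroys it, so DeadF must not be dereferenced past this point.)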
+    M.getFunctionList().erase(DeadF);
+  }
+
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
Index: lib/Transforms/Utils/InlineFunction.cpp
===================================================================
--- lib/Transforms/Utils/InlineFunction.cpp
+++ lib/Transforms/Utils/InlineFunction.cpp
@@ -1644,8 +1644,16 @@
   }
 
   // Update the callgraph if requested.
-  if (IFI.CG)
+  if (IFI.CG) {
     UpdateCallGraphAfterInlining(CS, FirstNewBlock, VMap, IFI);
+  } else {
+    // Otherwise just collect the raw call sites that were inlined.
+    for (BasicBlock &NewBB :
+         make_range(FirstNewBlock->getIterator(), Caller->end()))
+      for (Instruction &I : NewBB)
+        if (auto CS = CallSite(&I))
+          IFI.InlinedCallSites.push_back(CS);
+  }
 
   // For 'nodebug' functions, the associated DISubprogram is always null.
   // Conservatively avoid propagating the callsite debug location to
Index: test/Transforms/Inline/basictest.ll
===================================================================
--- test/Transforms/Inline/basictest.ll
+++ test/Transforms/Inline/basictest.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -inline -sroa -S | FileCheck %s
+; RUN: opt < %s -passes='cgscc(inline,function(sroa))' -S | FileCheck %s
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 
 define i32 @test1f(i32 %i) {
Index: test/Transforms/Inline/cgscc-update.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/cgscc-update.ll
@@ -0,0 +1,145 @@
+; RUN: opt < %s -aa-pipeline=basic-aa -passes='cgscc(function-attrs,inline)' -S | FileCheck %s
+; This test runs the inliner and the function attribute deduction. It ensures
+; that when the inliner mutates the call graph it correctly updates the CGSCC
+; iteration so that we can compute refined function attributes. In this way it
+; is leveraging function attribute computation to observe correct call graph
+; updates.
+
+; Boring unknown external function call.
+; CHECK: declare void @unknown()
+declare void @unknown()
+
+; Sanity check: this should get annotated as readnone.
+; CHECK: Function Attrs: readnone
+; CHECK-NEXT: declare void @readnone()
+declare void @readnone() readnone
+
+; The 'test1_' prefixed functions are designed to trigger forming a new direct
+; call in the inlined body of the function. After that, we form a new SCC and
+; using that we can deduce precise function attrs.
+
+; This function should no longer exist.
+; CHECK-NOT: @test1_f()
+define internal void @test1_f(void()* %p) {
+entry:
+  call void %p()
+  ret void
+}
+
+; This function should have had 'readnone' deduced for its SCC.
+; CHECK: Function Attrs: noinline readnone
+; CHECK-NEXT: define void @test1_g()
+define void @test1_g() noinline {
+entry:
+  call void @test1_f(void()* @test1_h)
+  ret void
+}
+
+; This function should have had 'readnone' deduced for its SCC.
+; CHECK: Function Attrs: noinline readnone
+; CHECK-NEXT: define void @test1_h()
+define void @test1_h() noinline {
+entry:
+  call void @test1_g()
+  call void @readnone()
+  ret void
+}
+
+
+; The 'test2_' prefixed functions are designed to trigger forming a new direct
+; call due to RAUW-ing the returned value of a called function into the caller.
+; This too should form a new SCC which can then be reasoned about to compute
+; precise function attrs.
+
+; This function should no longer exist.
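+; (Editorial note: @test2_f simply returns @test2_h, so inlining it RAUWs the
+; returned function pointer into the indirect call in @test2_g, turning it
+; into a new direct call that the CGSCC update logic must pick up.)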
+; CHECK-NOT: @test2_f()
+define internal void()* @test2_f() {
+entry:
+  ret void()* @test2_h
+}
+
+; This function should have had 'readnone' deduced for its SCC.
+; CHECK: Function Attrs: noinline readnone
+; CHECK-NEXT: define void @test2_g()
+define void @test2_g() noinline {
+entry:
+  %p = call void()* @test2_f()
+  call void %p()
+  ret void
+}
+
+; This function should have had 'readnone' deduced for its SCC.
+; CHECK: Function Attrs: noinline readnone
+; CHECK-NEXT: define void @test2_h()
+define void @test2_h() noinline {
+entry:
+  call void @test2_g()
+  call void @readnone()
+  ret void
+}
+
+
+; The 'test3_' prefixed functions are designed to inline in a way that causes
+; call sites to become trivially dead in the middle of inlining the call sites
+; of a single function, to make sure that the inliner does not get confused by
+; this pattern.
+
+; CHECK-NOT: @test3_maybe_unknown(
+define internal void @test3_maybe_unknown(i1 %b) {
+entry:
+  br i1 %b, label %then, label %exit
+
+then:
+  call void @unknown()
+  br label %exit
+
+exit:
+  ret void
+}
+
+; CHECK-NOT: @test3_f(
+define internal i1 @test3_f() {
+entry:
+  ret i1 false
+}
+
+; CHECK-NOT: @test3_g(
+define internal i1 @test3_g(i1 %b) {
+entry:
+  br i1 %b, label %then1, label %if2
+
+then1:
+  call void @test3_maybe_unknown(i1 true)
+  br label %if2
+
+if2:
+  %f = call i1 @test3_f()
+  br i1 %f, label %then2, label %exit
+
+then2:
+  call void @test3_maybe_unknown(i1 true)
+  br label %exit
+
+exit:
+  ret i1 false
+}
+
+; FIXME: Currently the inliner doesn't successfully mark this as readnone
+; because while it simplifies trivially dead CFGs when inlining callees it
+; doesn't simplify the caller's trivially dead CFG, and so we end up with a
+; dead block calling @unknown.
+; CHECK-NOT: Function Attrs: readnone
+; CHECK: define void @test3_h()
define void @test3_h() {
+entry:
+  %g = call i1 @test3_g(i1 false)
+  br i1 %g, label %then, label %exit
+
+then:
+  call void @test3_maybe_unknown(i1 true)
+  br label %exit
+
+exit:
+  call void @test3_maybe_unknown(i1 false)
+  ret void
+}
Index: test/Transforms/Inline/last-callsite.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/last-callsite.ll
@@ -0,0 +1,270 @@
+; RUN: opt < %s -passes='cgscc(inline)' -inline-threshold=0 -S | FileCheck %s
+
+; The 'test1_' prefixed functions test the basic 'last callsite' inline
+; threshold adjustment where we specifically inline the last call site of an
+; internal function regardless of cost.
+
+define internal void @test1_f() {
+entry:
+  %p = alloca i32
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  ret void
+}
+
+; Identical to @test1_f but doesn't get inlined because there is more than one
+; call. If this *does* get inlined, the body used both here and in @test1_f
+; isn't a good test for the different threshold based on the last call.
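+; (Editorial note: with -inline-threshold=0 the eight volatile stores make
+; this body unprofitable on cost alone; @test1_f above is inlined only via
+; the bonus applied to the sole remaining call to an internal function,
+; while @test1_g below keeps two call sites and thus gets no bonus.)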
+define internal void @test1_g() {
+entry:
+  %p = alloca i32
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  ret void
+}
+
+define void @test1() {
+; CHECK-LABEL: define void @test1()
+entry:
+  call void @test1_f()
+; CHECK-NOT: @test1_f
+
+  call void @test1_g()
+  call void @test1_g()
+; CHECK: call void @test1_g()
+; CHECK: call void @test1_g()
+
+  ret void
+}
+
+
+; The 'test2_' prefixed functions test that we can discover the last callsite
+; bonus after having inlined the prior call site. For this to work, we need
+; a callsite-dependent cost, so we have a trivial predicate guarding all of
+; the cost and set that in a particular direction.
+
+define internal void @test2_f(i1 %b) {
+entry:
+  %p = alloca i32
+  br i1 %b, label %then, label %exit
+
+then:
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Identical to @test2_f but doesn't get inlined because there is more than one
+; call. If this *does* get inlined, the body used both here and in @test2_f
+; isn't a good test for the different threshold based on the last call.
+define internal void @test2_g(i1 %b) {
+entry:
+  %p = alloca i32
+  br i1 %b, label %then, label %exit
+
+then:
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  br label %exit
+
+exit:
+  ret void
+}
+
+define void @test2() {
+; CHECK-LABEL: define void @test2()
+entry:
+  ; The first call is trivial to inline due to the argument.
+  call void @test2_f(i1 false)
+; CHECK-NOT: @test2_f
+
+  ; The second call is too expensive to inline unless we update the number of
+  ; calls after inlining the first.
+  call void @test2_f(i1 true)
+; CHECK-NOT: @test2_f
+
+  ; Sanity check that two calls with the hard predicate remain uninlined.
+  call void @test2_g(i1 true)
+  call void @test2_g(i1 true)
+; CHECK: call void @test2_g(i1 true)
+; CHECK: call void @test2_g(i1 true)
+
+  ret void
+}
+
+
+; The 'test3_' prefixed functions are similar to the 'test2_' functions but the
+; relative order of the trivial and hard-to-inline callsites is reversed. This
+; checks that the order of calls isn't significant to whether we observe the
+; "last callsite" threshold difference because the next-to-last gets inlined.
+; FIXME: We don't currently catch this case.
+
+define internal void @test3_f(i1 %b) {
+entry:
+  %p = alloca i32
+  br i1 %b, label %then, label %exit
+
+then:
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Identical to @test3_f but doesn't get inlined because there is more than one
+; call. If this *does* get inlined, the body used both here and in @test3_f
+; isn't a good test for the different threshold based on the last call.
+define internal void @test3_g(i1 %b) {
+entry:
+  %p = alloca i32
+  br i1 %b, label %then, label %exit
+
+then:
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  br label %exit
+
+exit:
+  ret void
+}
+
+define void @test3() {
+; CHECK-LABEL: define void @test3()
+entry:
+  ; The first call is too expensive to inline unless we update the number of
+  ; calls after inlining the second.
+  call void @test3_f(i1 true)
+; FIXME: We should inline this call without iteration.
+; CHECK: call void @test3_f(i1 true)
+
+  ; But the second call is trivial to inline due to the argument.
+  call void @test3_f(i1 false)
+; CHECK-NOT: @test3_f
+
+  ; Sanity check that two calls with the hard predicate remain uninlined.
+  call void @test3_g(i1 true)
+  call void @test3_g(i1 true)
+; CHECK: call void @test3_g(i1 true)
+; CHECK: call void @test3_g(i1 true)
+
+  ret void
+}
+
+
+; The 'test4_' prefixed functions test that we can discover the last callsite
+; bonus after having inlined the prior call site. For this to work, we need
+; a callsite-dependent cost, so we have a trivial predicate guarding all of
+; the cost and set that in a particular direction.
+
+define internal void @test4_f(i1 %b) {
+entry:
+  %p = alloca i32
+  br i1 %b, label %then, label %exit
+
+then:
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Identical to @test4_f but doesn't get inlined because there is more than one
+; call. If this *does* get inlined, the body used both here and in @test4_f
+; isn't a good test for the different threshold based on the last call.
+define internal void @test4_g(i1 %b) {
+entry:
+  %p = alloca i32
+  br i1 %b, label %then, label %exit
+
+then:
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  store volatile i32 0, i32* %p
+  br label %exit
+
+exit:
+  ret void
+}
+
+define void @test4() {
+; CHECK-LABEL: define void @test4()
+entry:
+  ; The first call is trivial to inline due to the argument. However this
+  ; argument also carefully uses the function being called as part of a complex
+  ; constant expression. Merely inlining and deleting the call isn't enough to
+  ; drop the use count here; we need to GC the dead constant expression as
+  ; well.
+  call void @test4_f(i1 icmp ne (i64 ptrtoint (void (i1)* @test4_f to i64), i64 ptrtoint (void (i1)* @test4_f to i64)))
+; CHECK-NOT: @test4_f
+
+  ; The second call is too expensive to inline unless we update the number of
+  ; calls after inlining the first.
+  call void @test4_f(i1 true)
+; CHECK-NOT: @test4_f
+
+  ; And check that a single call to a function which is used by a complex
+  ; constant expression cannot be inlined because the constant expression forms
+  ; a second use. If this part starts failing we need to use more complex
+  ; constant expressions to reference a particular function with them.
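+  ; (Editorial note: the icmp ne of two identical ptrtoint expressions
+  ; constant-folds to i1 false, which is why the store below is checked as
+  ; 'store volatile i1 false'; until it is folded away, the expression is a
+  ; second use of @test4_g that blocks the last-call-site bonus.)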
+  %sink = alloca i1
+  store volatile i1 icmp ne (i64 ptrtoint (void (i1)* @test4_g to i64), i64 ptrtoint (void (i1)* @test4_g to i64)), i1* %sink
+  call void @test4_g(i1 true)
+; CHECK: store volatile i1 false
+; CHECK: call void @test4_g(i1 true)
+
+  ret void
+}
Index: test/Transforms/Inline/nested-inline.ll
===================================================================
--- test/Transforms/Inline/nested-inline.ll
+++ test/Transforms/Inline/nested-inline.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -inline -S | FileCheck %s
+; RUN: opt < %s -passes='cgscc(inline)' -S | FileCheck %s
 ; Test that bar and bar2 are both inlined throughout and removed.
 @A = weak global i32 0		; <i32*> [#uses=1]
 @B = weak global i32 0		; <i32*> [#uses=1]