Index: include/llvm/Transforms/IPO/InlinerPass.h
===================================================================
--- include/llvm/Transforms/IPO/InlinerPass.h
+++ include/llvm/Transforms/IPO/InlinerPass.h
@@ -57,7 +57,18 @@
   /// given on the comand line. It is higher if the callee is marked with the
   /// inlinehint attribute.
   ///
-  unsigned getInlineThreshold(CallSite CS) const;
+  unsigned getInlineThreshold(CallSite CS);
+
+  /// The inline threshold can be dynamically tuned according to the control
+  /// flow context around the callsite.
+  virtual unsigned TuneThreshold(unsigned threshold, CallSite CS) {
+    return threshold;
+  }
+
+  /// Analyze the callee by computing loop info and other information.
+  virtual void
+  AnalyzeFunc(Function *F,
+              SmallVector<std::pair<CallSite, int>, 16> &CallSites) {}
 
   /// getInlineCost - This method must be implemented by the subclass to
   /// determine the cost of inlining the specified call site. If the cost
@@ -83,7 +94,10 @@
 
   /// shouldInline - Return true if the inliner should attempt to
   /// inline at the given CallSite.
-  bool shouldInline(CallSite CS);
+  /// Set ABC to true if the A->B->C case can pass the inlining threshold
+  /// check, and to false otherwise.
+  /// Pass true for Remark to emit analysis messages.
+  bool shouldInline(CallSite CS, bool &ABC, bool Remark);
 };
 
 } // End llvm namespace
Index: lib/Transforms/IPO/InlineSimple.cpp
===================================================================
--- lib/Transforms/IPO/InlineSimple.cpp
+++ lib/Transforms/IPO/InlineSimple.cpp
@@ -12,25 +12,86 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Transforms/IPO/InlinerPass.h"
 
 using namespace llvm;
 
 #define DEBUG_TYPE "inline"
 
+STATISTIC(NumCallsAnalyzedForLoopInfo, "Number of calls analyzed for loop info");
+
+static cl::opt<bool>
+    AgressiveInline("inline-aggressive", cl::Hidden, cl::Optional,
+                    cl::desc("Enable aggressive inlining or not"));
+
+static cl::opt<unsigned>
+    SmallLoopSize("inline-small-loop-size", cl::Hidden, cl::init(2),
+                  cl::ZeroOrMore,
+                  cl::desc("The maximum number of BBs in a small loop"));
+
+static cl::opt<unsigned> SmallLoopBonus(
+    "inline-small-loop-bonus", cl::Hidden, cl::init(400), cl::ZeroOrMore,
+    cl::desc("The percent of threshold enlarged for a simple loop"));
+
+static cl::opt<unsigned>
+    LoopBonus("inline-loop-bonus", cl::Hidden, cl::init(200), cl::ZeroOrMore,
+              cl::desc("The percent of threshold enlarged for a loop"));
+
+static cl::opt<unsigned>
+    CalleeNum("inline-callee-num", cl::Hidden, cl::init(10), cl::ZeroOrMore,
+              cl::desc("The maximum number of callees inside a caller"));
+
+static cl::opt<unsigned> ConstArgBonus(
+    "inline-const-arg-bonus", cl::Hidden, cl::init(200), cl::ZeroOrMore,
+    cl::desc("The percent of threshold enlarged for a callee with const args"));
+
 namespace {
 
+/// A small loop analyzer to help inlining context analysis.
+class LoopAnalyzer {
+public:
+  LoopAnalyzer() {}
+  ~LoopAnalyzer();
+
+  /// Analyze the loops of function F and record the result in the
+  /// LoopStatusMap cache. Always recompute the loop info if update is true.
+  void Analyze(Function *F, bool update);
+
+  /// Return the depth of the loop to which instruction I belongs.
+  unsigned GetLoopDepth(Instruction *I);
+
+  /// Return true if function F contains a loop, and false otherwise.
+  bool HasLoop(Function *F);
+
+  /// Return the loop to which instruction I belongs.
+  Loop *GetLoop(Instruction *I);
+
+private:
+  struct LoopS {
+    DominatorTree DT;
+    LoopInfoBase<BasicBlock, Loop> LI;
+  };
+
+  /// The cache of loop info status for any function.
+  DenseMap<Function *, LoopS *> LoopStatusMap;
+};
+
 /// \brief Actual inliner pass implementation.
 ///
 /// The common implementation of the inlining logic is shared between this
@@ -39,13 +100,29 @@
 class SimpleInliner : public Inliner {
   InlineCostAnalysis *ICA;
 
+  /// A loop analyzer to analyze/record loops for all functions in current SCC.
+  LoopAnalyzer LA;
+
+  /// Optimization level like -O1, -O2, -O3, ...
+  unsigned OptLevel;
+
+  /// Record all call sites visited by the inliner.
+  SmallVector<std::pair<CallSite, int>, 16> *CallSites;
+
 public:
-  SimpleInliner() : Inliner(ID), ICA(nullptr) {
+  SimpleInliner() : Inliner(ID), ICA(nullptr), OptLevel(0), CallSites(nullptr) {
     initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
   }
 
   SimpleInliner(int Threshold)
-      : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(nullptr) {
+      : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(nullptr),
+        OptLevel(0), CallSites(nullptr) {
+    initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  SimpleInliner(int Threshold, unsigned OptLevel)
+      : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(nullptr),
+        OptLevel(OptLevel), CallSites(nullptr) {
     initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
   }
 
@@ -57,12 +134,20 @@
 
   bool runOnSCC(CallGraphSCC &SCC) override;
   void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  /// Use the call site context to tune the original threshold and return it.
+  unsigned TuneThreshold(unsigned thres, CallSite CS) override;
+
+  /// Analyze the callee by computing loop info and other information.
+  void
+  AnalyzeFunc(Function *F,
+              SmallVector<std::pair<CallSite, int>, 16> &CallSites) override;
 };
 
 static int computeThresholdFromOptLevels(unsigned OptLevel,
                                          unsigned SizeOptLevel) {
   if (OptLevel > 2)
-    return 275;
+    return 250;
   if (SizeOptLevel == 1) // -Os
     return 75;
   if (SizeOptLevel == 2) // -Oz
@@ -91,7 +176,67 @@
 Pass *llvm::createFunctionInliningPass(unsigned OptLevel,
                                        unsigned SizeOptLevel) {
   return new SimpleInliner(
-      computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
+      computeThresholdFromOptLevels(OptLevel, SizeOptLevel), OptLevel);
+}
+
+LoopAnalyzer::~LoopAnalyzer() {
+  for (DenseMap<Function *, LoopS *>::const_iterator I = LoopStatusMap.begin(),
+                                                     E = LoopStatusMap.end();
+       I != E; ++I) {
+    const LoopS *LS = I->second;
+    assert(LS && "Loop info doesn't exist for function!");
+    delete LS;
+  }
+}
+
+/// Analyze loops of function F.
+/// Do nothing if update is false and F already has loop information.
+void LoopAnalyzer::Analyze(Function *F, bool update) {
+  LoopS *LS;
+
+  if (LoopStatusMap.count(F) && !update)
+    return;
+
+  if (LoopStatusMap.count(F) && update)
+    delete LoopStatusMap[F];
+
+  LS = new LoopS();
+  LoopStatusMap[F] = LS;
+  LS->DT.recalculate(*F);
+  LS->LI.Analyze(LS->DT);
+
+  ++NumCallsAnalyzedForLoopInfo;
+}
+
+/// Return the loop to which instruction I belongs.
+Loop *LoopAnalyzer::GetLoop(Instruction *I) {
+  Function *F = I->getParent()->getParent();
+
+  // F could be the caller's caller, and if we have never analyzed it before,
+  // we must analyze it here to get loop information. If we already have F's
+  // loop info, there is no need to update it at this moment, because F cannot
+  // have changed since the last inlining. We only update loop info whenever
+  // inlining actually happens.
+  Analyze(F, false);
+
+  const LoopS *LS = LoopStatusMap.find(F)->second;
+  return LS->LI.getLoopFor(I->getParent());
+}
+
+/// Return the depth of the loop to which instruction I belongs.
+unsigned LoopAnalyzer::GetLoopDepth(Instruction *I) {
+  Function *F = I->getParent()->getParent();
+
+  Analyze(F, false);
+  const LoopS *LS = LoopStatusMap.find(F)->second;
+  return LS->LI.getLoopDepth(I->getParent());
+}
+
+/// Return true if function F contains a loop, and false otherwise.
+bool LoopAnalyzer::HasLoop(Function *F) {
+  Analyze(F, false);
+  const LoopS *LS = LoopStatusMap.find(F)->second;
+  return !LS->LI.empty();
 }
 
 bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) {
@@ -103,3 +248,86 @@
   AU.addRequired<InlineCostAnalysis>();
   Inliner::getAnalysisUsage(AU);
 }
+
+/// The inline threshold can be dynamically tuned according to the control
+/// flow context around the callsite.
+unsigned SimpleInliner::TuneThreshold(unsigned thres, CallSite CS) {
+  // We only tune the threshold if the opt level is larger than 2 or aggressive
+  // inlining is specified on the command line.
+  if (!AgressiveInline && OptLevel <= 2)
+    return thres;
+
+  Function *Caller = CS.getCaller();
+  Function *Callee = CS.getCalledFunction();
+  StringRef CalleeName = Callee->getName();
+
+  // The following are constant factors used to tune the threshold.
+
+  // The maximum number of calls to the same callee inside a caller.
+  const unsigned MaxNumOfCallee = CalleeNum;
+  // The maximum number of BBs in a simple loop.
+  const unsigned MaxNumOfBlocksInSimpleLoop = SmallLoopSize;
+  // The percent of threshold enlarged for a simple loop.
+  const unsigned SimpleLoopThresholdBonus = SmallLoopBonus;
+  // The percent of threshold enlarged for a loop.
+  const unsigned LoopThresholdBonus = LoopBonus;
+  // The percent of threshold enlarged for callees with constant arguments.
+  const unsigned ConstantArgThresholdBonus = ConstArgBonus;
+
+  // The heuristic rules are defined below.
+
+  // Avoid inlining the same callee too many times, to limit code size growth.
+  unsigned NumOfCallee = 0;
+  assert(CallSites && "Call sites don't exist!");
+  for (unsigned CSi = 0; CSi != CallSites->size(); ++CSi) {
+    CallSite CS = (*CallSites)[CSi].first;
+
+    if (Caller != CS.getCaller() || Callee != CS.getCalledFunction())
+      continue;
+
+    NumOfCallee++;
+  }
+  if (NumOfCallee > MaxNumOfCallee) {
+    DEBUG(dbgs() << " > " << CalleeName << " is called " << NumOfCallee
+                 << " times, skip.\n");
+    return thres;
+  }
+
+  // Give a threshold bonus if the call site is inside a loop body.
+  unsigned LoopDepth = LA.GetLoopDepth(CS.getInstruction());
+  if (LoopDepth) {
+    Loop *l = LA.GetLoop(CS.getInstruction());
+
+    // Give a further threshold bonus if the loop body is very small.
+    if (l->getNumBlocks() <= MaxNumOfBlocksInSimpleLoop) {
+      DEBUG(dbgs() << " > " << l->getNumBlocks() << " BBs found in loop.\n");
+      return thres * SimpleLoopThresholdBonus / 100;
+    } else {
+      DEBUG(dbgs() << " > " << CalleeName << " is called in loop.\n");
+      return thres * LoopThresholdBonus / 100;
+    }
+  }
+
+  // Give a threshold bonus if the callee has constant actual arguments and
+  // contains a loop. Constants can significantly help loop optimization;
+  // e.g., a loop bound could become a constant after inlining.
+  bool FindConstArg = false;
+  for (CallSite::arg_iterator CAI = CS.arg_begin(), CAE = CS.arg_end();
+       CAI != CAE; ++CAI) {
+    if (isa<Constant>(*CAI))
+      FindConstArg = true;
+  }
+  if (FindConstArg && LA.HasLoop(Callee)) {
+    DEBUG(dbgs() << " > " << CalleeName << " has const arg.\n");
+    return thres * ConstantArgThresholdBonus / 100;
+  }
+
+  // TODO: add more heuristic rules here.
+  return thres;
+}
+
+void SimpleInliner::AnalyzeFunc(
+    Function *F, SmallVector<std::pair<CallSite, int>, 16> &CallSites) {
+  this->CallSites = &CallSites;
+  LA.Analyze(F, true);
+}
Index: lib/Transforms/IPO/Inliner.cpp
===================================================================
--- lib/Transforms/IPO/Inliner.cpp
+++ lib/Transforms/IPO/Inliner.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -251,7 +252,7 @@
   return true;
 }
 
-unsigned Inliner::getInlineThreshold(CallSite CS) const {
+unsigned Inliner::getInlineThreshold(CallSite CS) {
   int thres = InlineThreshold; // -inline-threshold or else selected by
                                // overall opt level
 
@@ -284,7 +285,8 @@
       ColdThreshold < thres)
     thres = ColdThreshold;
 
-  return thres;
+  // We need to tune the threshold according to the control flow context.
+  return TuneThreshold(thres, CS);
 }
 
 static void emitAnalysis(CallSite CS, const Twine &Msg) {
@@ -295,34 +297,45 @@
 }
 
 /// Return true if the inliner should attempt to inline at the given CallSite.
-bool Inliner::shouldInline(CallSite CS) {
+/// Set ABC to true if the A->B->C case can pass the inlining threshold
+/// check, and to false otherwise.
+/// Pass true for Remark to emit analysis messages.
+bool Inliner::shouldInline(CallSite CS, bool &ABC, bool Remark) {
+  ABC = false;
+
   InlineCost IC = getInlineCost(CS);
 
   if (IC.isAlways()) {
-    DEBUG(dbgs() << "    Inlining: cost=always"
-                 << ", Call: " << *CS.getInstruction() << "\n");
-    emitAnalysis(CS, Twine(CS.getCalledFunction()->getName()) +
-                         " should always be inlined (cost=always)");
+    if (Remark) {
+      DEBUG(dbgs() << "    Inlining: cost=always"
+                   << ", Call: " << *CS.getInstruction() << "\n");
+      emitAnalysis(CS, Twine(CS.getCalledFunction()->getName()) +
+                           " should always be inlined (cost=always)");
+    }
     return true;
   }
 
   if (IC.isNever()) {
-    DEBUG(dbgs() << "    NOT Inlining: cost=never"
-                 << ", Call: " << *CS.getInstruction() << "\n");
-    emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() +
-                           " should never be inlined (cost=never)"));
+    if (Remark) {
+      DEBUG(dbgs() << "    NOT Inlining: cost=never"
+                   << ", Call: " << *CS.getInstruction() << "\n");
+      emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() +
+                             " should never be inlined (cost=never)"));
+    }
     return false;
   }
 
   Function *Caller = CS.getCaller();
 
   if (!IC) {
-    DEBUG(dbgs() << "    NOT Inlining: cost=" << IC.getCost()
-                 << ", thres=" << (IC.getCostDelta() + IC.getCost())
-                 << ", Call: " << *CS.getInstruction() << "\n");
-    emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() +
-                           " too costly to inline (cost=") +
-                     Twine(IC.getCost()) + ", threshold=" +
-                     Twine(IC.getCostDelta() + IC.getCost()) + ")");
+    if (Remark) {
+      DEBUG(dbgs() << "    NOT Inlining: cost=" << IC.getCost()
+                   << ", thres=" << (IC.getCostDelta() + IC.getCost())
+                   << ", Call: " << *CS.getInstruction() << "\n");
+      emitAnalysis(CS, Twine(CS.getCalledFunction()->getName() +
+                             " too costly to inline (cost=") +
+                       Twine(IC.getCost()) + ", threshold=" +
+                       Twine(IC.getCostDelta() + IC.getCost()) + ")");
+    }
     return false;
   }
 
@@ -386,25 +399,31 @@
       TotalSecondaryCost += InlineConstants::LastCallToStaticBonus;
 
     if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost()) {
-      DEBUG(dbgs() << "    NOT Inlining: " << *CS.getInstruction() <<
-           " Cost = " << IC.getCost() <<
-           ", outer Cost = " << TotalSecondaryCost << '\n');
-      emitAnalysis(
-          CS, Twine("Not inlining. Cost of inlining " +
-                    CS.getCalledFunction()->getName() +
-                    " increases the cost of inlining " +
-                    CS.getCaller()->getName() + " in other contexts"));
+      if (Remark) {
+        DEBUG(dbgs() << "    NOT Inlining: " << *CS.getInstruction()
+                     << " Cost = " << IC.getCost()
+                     << ", outer Cost = " << TotalSecondaryCost << '\n');
+        emitAnalysis(CS,
+                     Twine("Not inlining. Cost of inlining " +
+                           CS.getCalledFunction()->getName() +
+                           " increases the cost of inlining " +
+                           CS.getCaller()->getName() + " in other contexts"));
+      }
+      ABC = true;
       return false;
     }
   }
 
-  DEBUG(dbgs() << "    Inlining: cost=" << IC.getCost()
-        << ", thres=" << (IC.getCostDelta() + IC.getCost())
-        << ", Call: " << *CS.getInstruction() << '\n');
-  emitAnalysis(
-      CS, CS.getCalledFunction()->getName() + Twine(" can be inlined into ") +
-              CS.getCaller()->getName() + " with cost=" + Twine(IC.getCost()) +
-              " (threshold=" + Twine(IC.getCostDelta() + IC.getCost()) + ")");
+  if (Remark) {
+    DEBUG(dbgs() << "    Inlining: cost=" << IC.getCost()
+                 << ", thres=" << (IC.getCostDelta() + IC.getCost())
+                 << ", Call: " << *CS.getInstruction() << '\n');
+    emitAnalysis(CS, CS.getCalledFunction()->getName() +
+                         Twine(" can be inlined into ") +
+                         CS.getCaller()->getName() + " with cost=" +
+                         Twine(IC.getCost()) + " (threshold=" +
+                         Twine(IC.getCostDelta() + IC.getCost()) + ")");
+  }
   return true;
 }
 
@@ -488,12 +507,76 @@
   InlinedArrayAllocasTy InlinedArrayAllocas;
   InlineFunctionInfo InlineInfo(&CG, AA, ACT);
 
+  // Solve the A->B->C issue (the ABC case for short). We check all call sites
+  // once before doing the real inlining transformation. If any positive
+  // A->B->C case is captured, we exit the analysis of the current function
+  // immediately, because C can still be inlined into A after B has been
+  // inlined into A. Likewise, call sites such as B->X, where X can be inlined
+  // into B but the A->B->X situation is negative, may still get an inlining
+  // opportunity after B is inlined into A. The early exit for the ABC case
+  // can significantly reduce compile time for C++ programs because all the
+  // analysis of late-called callees is skipped.
+  //
+  // FIXME: This may not be accurate enough, because inlining an early-called
+  // function could simplify the arguments of a late-called function, so some
+  // positive A->B->C cases could eventually be missed.
+
+  // Collect all callers and call sites for the ABC case check.
+  SmallSetVector<Function *, 16> Callers;
+  SmallSetVector<CallSite, 16> CallSitesForABC;
+  for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
+    CallSite CS = CallSites[CSi].first;
+    Function *Caller = CS.getCaller();
+    Function *Callee = CS.getCalledFunction();
+
+    // If this call site is dead and it is to a readonly function, we should
+    // just delete the call instead of trying to inline it, regardless of
+    // size. This happens because IPSCCP propagates the result out of the
+    // call and then we're left with the dead call.
+    if (isInstructionTriviallyDead(CS.getInstruction(), TLI))
+      continue;
+
+    // We can only inline direct calls to non-declarations.
+    if (!Callee || Callee->isDeclaration())
+      continue;
+
+    // If this call site was obtained by inlining another function, verify
+    // that the include path for the function did not include the callee
+    // itself. If so, we'd be recursively inlining the same function,
+    // which would provide the same callsites, which would cause us to
+    // infinitely inline.
+    int InlineHistoryID = CallSites[CSi].second;
+    if (InlineHistoryID != -1 &&
+        InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory))
+      continue;
+
+    Callers.insert(Caller);
+    CallSitesForABC.insert(CS);
+  }
+
+  // Analyze all callers once.
+  for (unsigned i = 0; i < Callers.size(); i++)
+    AnalyzeFunc(Callers[i], CallSites);
+
+  // If a positive ABC case is found among the call sites, exit immediately.
+  bool ABC = false;
+
+  for (unsigned i = 0; i < CallSitesForABC.size(); i++)
+    if (!shouldInline(CallSitesForABC[i], ABC, false))
+      if (ABC)
+        return false;
+
   // Now that we have all of the call sites, loop over them and inline them if
   // it looks profitable to do so.
   bool Changed = false;
   bool LocalChange;
+  Function *LastCaller = nullptr;
+  CallSite LastCS;
   do {
     LocalChange = false;
+    if (CallSites.size())
+      LastCS = CallSites[CallSites.size() - 1].first;
+
     // Iterate over the outer loop because inlining functions can cause indirect
     // calls to become direct calls.
     for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
@@ -532,9 +615,26 @@
       // Get DebugLoc to report. CS will be invalid after Inliner.
       DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
 
+      if (LastCaller != Caller) {
+        // We don't recompute loop info until the caller changes. Inlining
+        // any single call site should not significantly change the loop info
+        // for the other call sites in the same caller. This limits the number
+        // of loop info computations, because they are expensive.
+        AnalyzeFunc(Caller, CallSites);
+        LastCaller = Caller;
+      } else if (LastCS == CS) {
+        // Once we reach the end of the current level of call sites, the call
+        // sites newly introduced by inlining form the next level and have to
+        // be analyzed again to get accurate loop info.
+        LastCaller = nullptr;
+
+        // Mark the end of the next level of call sites.
+        LastCS = CallSites[CallSites.size() - 1].first;
+      }
+
       // If the policy determines that we should inline this function,
       // try to do so.
-      if (!shouldInline(CS)) {
+      if (!shouldInline(CS, ABC, true)) {
         emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc,
                                      Twine(Callee->getName() +
                                            " will not be inlined into " +
@@ -596,16 +696,8 @@
         ++NumDeleted;
       }
 
-      // Remove this call site from the list. If possible, use
-      // swap/pop_back for efficiency, but do not use it if doing so would
-      // move a call site to a function in this SCC before the
-      // 'FirstCallInSCC' barrier.
-      if (SCC.isSingular()) {
-        CallSites[CSi] = CallSites.back();
-        CallSites.pop_back();
-      } else {
-        CallSites.erase(CallSites.begin()+CSi);
-      }
+      // Remove this call site from the list.
+      CallSites.erase(CallSites.begin() + CSi);
       --CSi;
 
       Changed = true;
Index: test/Transforms/Inline/inline-loop.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/inline-loop.ll
@@ -0,0 +1,376 @@
+; Check the default behavior of aggressive inlining.
+; RUN: opt -S -inline -inline-aggressive -inline-threshold=225 < %s | FileCheck %s
+
+; Check the default behavior without aggressive inlining.
+; RUN: opt -S -inline -inline-threshold=225 < %s | FileCheck %s -check-prefix=NO-AGGRESSIVE
+
+; For aggressive inlining, the default small loop bonus is 400, so we check
+; that a big function is not inlined in a small loop when the bonus is 200.
+; RUN: opt -S -inline -inline-aggressive -inline-threshold=225 -inline-small-loop-bonus=200 < %s | FileCheck %s -check-prefix=SMALL-LOOP
+
+; For aggressive inlining, the default small loop size is 2, i.e. 2 BBs inside
+; the loop body, so we check that a big function can still be inlined when the
+; loop body has 3 BBs and the limit is raised to 3.
+; RUN: opt -S -inline -inline-aggressive -inline-threshold=225 -inline-small-loop-size=3 < %s | FileCheck %s -check-prefix=SMALL-LOOP-SIZE
+
+; For aggressive inlining, the default loop bonus is 200, i.e. the threshold
+; is doubled, so we check that a big function can be inlined into a big loop
+; only if we enlarge the loop bonus (here to 400).
+; RUN: opt -S -inline -inline-aggressive -inline-threshold=225 -inline-loop-bonus=400 < %s | FileCheck %s -check-prefix=BIG-LOOP-BONUS + +@a = global i32 4 + +; This function should be larger than threshold 225 and called inside a loop. +define i32 @HotFunction1(i32 %a) { +entry: + %a1 = load volatile i32, i32* @a + %x1 = add i32 %a1, %a1 + %a2 = load volatile i32, i32* @a + %x2 = add i32 %x1, %a2 + %a3 = load volatile i32, i32* @a + %x3 = add i32 %x2, %a3 + %a4 = load volatile i32, i32* @a + %x4 = add i32 %x3, %a4 + %a5 = load volatile i32, i32* @a + %x5 = add i32 %x4, %a5 + %a6 = load volatile i32, i32* @a + %x6 = add i32 %x5, %a6 + %a7 = load volatile i32, i32* @a + %x7 = add i32 %x6, %a7 + %a8 = load volatile i32, i32* @a + %x8 = add i32 %x7, %a8 + %a9 = load volatile i32, i32* @a + %x9 = add i32 %x8, %a9 + %a10 = load volatile i32, i32* @a + %x10 = add i32 %x9, %a10 + %a11 = load volatile i32, i32* @a + %x11 = add i32 %x10, %a11 + %a12 = load volatile i32, i32* @a + %x12 = add i32 %x11, %a12 + + %a21 = load volatile i32, i32* @a + %x21 = add i32 %x12, %a21 + %a22 = load volatile i32, i32* @a + %x22 = add i32 %x21, %a22 + %a23 = load volatile i32, i32* @a + %x23 = add i32 %x22, %a23 + %a24 = load volatile i32, i32* @a + %x24 = add i32 %x23, %a24 + %a25 = load volatile i32, i32* @a + %x25 = add i32 %x24, %a25 + %a26 = load volatile i32, i32* @a + %x26 = add i32 %x25, %a26 + %a27 = load volatile i32, i32* @a + %x27 = add i32 %x26, %a27 + %a28 = load volatile i32, i32* @a + %x28 = add i32 %x27, %a28 + %a29 = load volatile i32, i32* @a + %x29 = add i32 %x28, %a29 + %a30 = load volatile i32, i32* @a + %x30 = add i32 %x29, %a30 + %a31 = load volatile i32, i32* @a + %x31 = add i32 %x30, %a31 + %a32 = load volatile i32, i32* @a + %x32 = add i32 %x31, %a32 + + %a41 = load volatile i32, i32* @a + %x41 = add i32 %x32, %a41 + %a42 = load volatile i32, i32* @a + %x42 = add i32 %x41, %a42 + %a43 = load volatile i32, i32* @a + %x43 = add i32 %x42, %a43 + %a44 = load volatile i32, i32* @a + %x44 = add i32 %x43, %a44 + %a45 = load volatile i32, i32* @a + %x45 = add i32 %x44, %a45 + %a46 = load volatile i32, i32* @a + %x46 = add i32 %x45, %a46 + %a47 = load volatile i32, i32* @a + %x47 = add i32 %x46, %a47 + %a48 = load volatile i32, i32* @a + %x48 = add i32 %x47, %a48 + %a49 = load volatile i32, i32* @a + %x49 = add i32 %x48, %a49 + %a50 = load volatile i32, i32* @a + %x50 = add i32 %x49, %a50 + %a51 = load volatile i32, i32* @a + %x51 = add i32 %x50, %a51 + %a52 = load volatile i32, i32* @a + %x52 = add i32 %x51, %a52 + + %add = add i32 %x52, %a + ret i32 %add +} + +define i32 @bar1(i32 %n) { +; CHECK-LABEL: @bar1 +; CHECK-NOT: call i32 @HotFunction1(i32 +; NO-AGGRESSIVE-LABEL: @bar1 +; NO-AGGRESSIV: call i32 @HotFunction1(i32 +entry: + %cmp5 = icmp sgt i32 %n, 1 + br i1 %cmp5, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %s.07 = phi i32 [ %s.1, %for.inc ], [ 0, %for.body.preheader ] + %i.06 = phi i32 [ %inc, %for.inc ], [ 1, %for.body.preheader ] + %rem = srem i32 %i.06, 3 + %tobool = icmp eq i32 %rem, 0 + br i1 %tobool, label %for.inc, label %if.then + +if.then: + %call = tail call i32 @HotFunction1(i32 %i.06) + %add = add nsw i32 %call, %s.07 + br label %for.inc + +for.inc: + %s.1 = phi i32 [ %add, %if.then ], [ %s.07, %for.body ] + %inc = add nuw nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + %s.1.lcssa = phi i32 [ %s.1, %for.inc ] + br label %for.end + 
+for.end: + %s.0.lcssa = phi i32 [ 0, %entry ], [ %s.1.lcssa, %for.end.loopexit ] + ret i32 %s.0.lcssa +} + +; This function should be larger than threshold 450 and called inside a small +; loop. +define i32 @HotFunction2(i32 %a) { +entry: + %a1 = load volatile i32, i32* @a + %x1 = add i32 %a1, %a1 + %a2 = load volatile i32, i32* @a + %x2 = add i32 %x1, %a2 + %a3 = load volatile i32, i32* @a + %x3 = add i32 %x2, %a3 + %a4 = load volatile i32, i32* @a + %x4 = add i32 %x3, %a4 + %a5 = load volatile i32, i32* @a + %x5 = add i32 %x4, %a5 + %a6 = load volatile i32, i32* @a + %x6 = add i32 %x5, %a6 + %a7 = load volatile i32, i32* @a + %x7 = add i32 %x6, %a7 + %a8 = load volatile i32, i32* @a + %x8 = add i32 %x7, %a8 + %a9 = load volatile i32, i32* @a + %x9 = add i32 %x8, %a9 + %a10 = load volatile i32, i32* @a + %x10 = add i32 %x9, %a10 + %a11 = load volatile i32, i32* @a + %x11 = add i32 %x10, %a11 + %a12 = load volatile i32, i32* @a + %x12 = add i32 %x11, %a12 + + %a21 = load volatile i32, i32* @a + %x21 = add i32 %x12, %a21 + %a22 = load volatile i32, i32* @a + %x22 = add i32 %x21, %a22 + %a23 = load volatile i32, i32* @a + %x23 = add i32 %x22, %a23 + %a24 = load volatile i32, i32* @a + %x24 = add i32 %x23, %a24 + %a25 = load volatile i32, i32* @a + %x25 = add i32 %x24, %a25 + %a26 = load volatile i32, i32* @a + %x26 = add i32 %x25, %a26 + %a27 = load volatile i32, i32* @a + %x27 = add i32 %x26, %a27 + %a28 = load volatile i32, i32* @a + %x28 = add i32 %x27, %a28 + %a29 = load volatile i32, i32* @a + %x29 = add i32 %x28, %a29 + %a30 = load volatile i32, i32* @a + %x30 = add i32 %x29, %a30 + %a31 = load volatile i32, i32* @a + %x31 = add i32 %x30, %a31 + %a32 = load volatile i32, i32* @a + %x32 = add i32 %x31, %a32 + + %a41 = load volatile i32, i32* @a + %x41 = add i32 %x32, %a41 + %a42 = load volatile i32, i32* @a + %x42 = add i32 %x41, %a42 + %a43 = load volatile i32, i32* @a + %x43 = add i32 %x42, %a43 + %a44 = load volatile i32, i32* @a + %x44 = add i32 %x43, %a44 + %a45 = load volatile i32, i32* @a + %x45 = add i32 %x44, %a45 + %a46 = load volatile i32, i32* @a + %x46 = add i32 %x45, %a46 + %a47 = load volatile i32, i32* @a + %x47 = add i32 %x46, %a47 + %a48 = load volatile i32, i32* @a + %x48 = add i32 %x47, %a48 + %a49 = load volatile i32, i32* @a + %x49 = add i32 %x48, %a49 + %a50 = load volatile i32, i32* @a + %x50 = add i32 %x49, %a50 + %a51 = load volatile i32, i32* @a + %x51 = add i32 %x50, %a51 + %a52 = load volatile i32, i32* @a + %x52 = add i32 %x51, %a52 + + %a61 = load volatile i32, i32* @a + %x61 = add i32 %a52, %a61 + %a62 = load volatile i32, i32* @a + %x62 = add i32 %x61, %a62 + %a63 = load volatile i32, i32* @a + %x63 = add i32 %x62, %a63 + %a64 = load volatile i32, i32* @a + %x64 = add i32 %x63, %a64 + %a65 = load volatile i32, i32* @a + %x65 = add i32 %x64, %a65 + %a66 = load volatile i32, i32* @a + %x66 = add i32 %x65, %a66 + %a67 = load volatile i32, i32* @a + %x67 = add i32 %x66, %a67 + %a68 = load volatile i32, i32* @a + %x68 = add i32 %x67, %a68 + %a69 = load volatile i32, i32* @a + %x69 = add i32 %x68, %a69 + %a610 = load volatile i32, i32* @a + %x610 = add i32 %x69, %a610 + %a611 = load volatile i32, i32* @a + %x611 = add i32 %x610, %a611 + %a612 = load volatile i32, i32* @a + %x612 = add i32 %x611, %a612 + + %a621 = load volatile i32, i32* @a + %x621 = add i32 %x612, %a621 + %a622 = load volatile i32, i32* @a + %x622 = add i32 %x621, %a622 + %a623 = load volatile i32, i32* @a + %x623 = add i32 %x622, %a623 + %a624 = load volatile i32, i32* @a + %x624 = add i32 
%x623, %a624 + %a625 = load volatile i32, i32* @a + %x625 = add i32 %x624, %a625 + %a626 = load volatile i32, i32* @a + %x626 = add i32 %x625, %a626 + %a627 = load volatile i32, i32* @a + %x627 = add i32 %x626, %a627 + %a628 = load volatile i32, i32* @a + %x628 = add i32 %x627, %a628 + %a629 = load volatile i32, i32* @a + %x629 = add i32 %x628, %a629 + %a630 = load volatile i32, i32* @a + %x630 = add i32 %x629, %a630 + %a631 = load volatile i32, i32* @a + %x631 = add i32 %x630, %a631 + %a632 = load volatile i32, i32* @a + %x632 = add i32 %x631, %a632 + + %a641 = load volatile i32, i32* @a + %x641 = add i32 %x632, %a641 + %a642 = load volatile i32, i32* @a + %x642 = add i32 %x641, %a642 + %a643 = load volatile i32, i32* @a + %x643 = add i32 %x642, %a643 + %a644 = load volatile i32, i32* @a + %x644 = add i32 %x643, %a644 + %a645 = load volatile i32, i32* @a + %x645 = add i32 %x644, %a645 + %a646 = load volatile i32, i32* @a + %x646 = add i32 %x645, %a646 + %a647 = load volatile i32, i32* @a + %x647 = add i32 %x646, %a647 + %a648 = load volatile i32, i32* @a + %x648 = add i32 %x647, %a648 + %a649 = load volatile i32, i32* @a + %x649 = add i32 %x648, %a649 + %a650 = load volatile i32, i32* @a + %x650 = add i32 %x649, %a650 + %a651 = load volatile i32, i32* @a + %x651 = add i32 %x650, %a651 + %a652 = load volatile i32, i32* @a + %x652 = add i32 %x651, %a652 + + %add = add i32 %x652, %a + ret i32 %add +} + +define i32 @bar2(i32 %n) { +; CHECK-LABEL: @bar +; CHECK-NOT: call i32 @HotFunction2(i32 +; NO-AGGRESSIVE-LABEL: @bar2 +; NO-AGGRESSIV: call i32 @HotFunction2(i32 +; SMALL-LOOP-LABEL: @bar2 +; SMALL-LOOP: call i32 @HotFunction2(i32 +entry: + %cmp4 = icmp sgt i32 %n, 1 + br i1 %cmp4, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %s.06 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %i.05 = phi i32 [ %inc, %for.body ], [ 1, %for.body.preheader ] + %call = tail call i32 @HotFunction2(i32 %i.05) + %add = add nsw i32 %call, %s.06 + %inc = add nuw nsw i32 %i.05, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + %add.lcssa = phi i32 [ %add, %for.body ] + br label %for.end + +for.end: + %s.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ] + ret i32 %s.0.lcssa +} + +; This function has loop with > 2BB in loop body, so it can't get enough +; bonus to inline big callee HotFunction2. 
+define i32 @bar3(i32 %n) { +; CHECK-LABEL: @bar3 +; CHECK: call i32 @HotFunction2(i32 +; SMALL-LOOP-SIZE-LABEL: @bar3 +; SMALL-LOOP-SIZE-NOT: call i32 @HotFunction2(i32 +; BIG-LOOP-BONUS-LABEL: @bar3 +; BIG-LOOP-BONUS-NOT: call i32 @HotFunction2(i32 +entry: + %cmp5 = icmp sgt i32 %n, 1 + br i1 %cmp5, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %s.07 = phi i32 [ %s.1, %for.inc ], [ 0, %for.body.preheader ] + %i.06 = phi i32 [ %inc, %for.inc ], [ 1, %for.body.preheader ] + %rem = srem i32 %i.06, 3 + %tobool = icmp eq i32 %rem, 0 + br i1 %tobool, label %for.inc, label %if.then + +if.then: + %call = tail call i32 @HotFunction2(i32 %i.06) + %add = add nsw i32 %call, %s.07 + br label %for.inc + +for.inc: + %s.1 = phi i32 [ %add, %if.then ], [ %s.07, %for.body ] + %inc = add nuw nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + %s.1.lcssa = phi i32 [ %s.1, %for.inc ] + br label %for.end + +for.end: + %s.0.lcssa = phi i32 [ 0, %entry ], [ %s.1.lcssa, %for.end.loopexit ] + ret i32 %s.0.lcssa +} + Index: test/Transforms/Inline/inline-misc.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/inline-misc.ll @@ -0,0 +1,128 @@ +; For aggressive inline, by default +; (1) The threshold bonus for callee with constant argument is 200, i.e. +; enlarge threshold 2 times. +; (2) The max number of same callees is 10. i.e. the callees called >10 times +; can't be inlined. +; RUN: opt -inline -S -inline-threshold=75 -inline-aggressive < %s | FileCheck %s +; RUN: opt -inline -S -inline-threshold=75 < %s | FileCheck %s -check-prefix=NO-AGGRESSIVE + +; Negatively check big function can't be inlined without enlarging threshold +; for function with constant argument. +; RUN: opt -inline -S -inline-threshold=75 -inline-aggressive -inline-const-arg-bonus=100 < %s | FileCheck %s -check-prefix=LOW-CONST-ARG-BONUS + +; Lower the max number of same callees to check if -inline-callee-num can work. 
+; RUN: opt -inline -S -inline-threshold=75 -inline-aggressive -inline-callee-num=9 < %s | FileCheck %s -check-prefix=CALLEE-NUM + +declare i32 @HotFunction(i32) + +define i32 @bar(i32 %n) { +entry: + %cmp29 = icmp sgt i32 %n, 1 + br i1 %cmp29, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %s.031 = phi i32 [ %s.1, %for.inc ], [ 0, %for.body.preheader ] + %i.030 = phi i32 [ %inc, %for.inc ], [ 1, %for.body.preheader ] + %rem = srem i32 %i.030, 3 + %tobool = icmp eq i32 %rem, 0 + br i1 %tobool, label %for.inc, label %if.then + +if.then: + %call = tail call i32 @HotFunction(i32 %i.030) + %add = add nsw i32 %call, %s.031 + br label %for.inc + +for.inc: + %s.1 = phi i32 [ %add, %if.then ], [ %s.031, %for.body ] + %inc = add nuw nsw i32 %i.030, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + %s.1.lcssa = phi i32 [ %s.1, %for.inc ] + br label %for.end + +for.end: + %s.0.lcssa = phi i32 [ 0, %entry ], [ %s.1.lcssa, %for.end.loopexit ] + %mul = shl nsw i32 %n, 1 + %call1 = tail call i32 @HotFunction(i32 %mul) + %add2 = add nsw i32 %call1, %s.0.lcssa + %cmp426 = icmp slt i32 %n, 1 + br i1 %cmp426, label %for.body5.preheader, label %for.end13 + +for.body5.preheader: + br label %for.body5 + +for.body5: + %s.228 = phi i32 [ %s.3, %for.inc12 ], [ %add2, %for.body5.preheader ] + %i.127 = phi i32 [ %sub, %for.inc12 ], [ %n, %for.body5.preheader ] + %rem6 = srem i32 %i.127, 3 + %tobool7 = icmp eq i32 %rem6, 0 + br i1 %tobool7, label %for.inc12, label %if.then8 + +if.then8: + %call9 = tail call i32 @HotFunction(i32 %i.127) + %add10 = add nsw i32 %call9, %s.228 + br label %for.inc12 + +for.inc12: + %s.3 = phi i32 [ %add10, %if.then8 ], [ %s.228, %for.body5 ] + %sub = add nsw i32 %i.127, -2 + %cmp4 = icmp slt i32 %sub, 1 + br i1 %cmp4, label %for.body5, label %for.end13.loopexit + +for.end13.loopexit: + %s.3.lcssa = phi i32 [ %s.3, %for.inc12 ] + br label %for.end13 + +for.end13: + %s.2.lcssa = phi i32 [ %add2, %for.end ], [ %s.3.lcssa, %for.end13.loopexit ] + ret i32 %s.2.lcssa +} + +define i32 @foo(i32 %x) { +; CHECK-LABEL: @foo +; CHECK-NOT: call i32 @bar( +; NO-AGGRESSIVE-LABEL: @foo +; NO-AGGRESSIVE: call i32 @bar( +; LOW-CONST-ARG-BONUS-LABEL: @foo +; LOW-CONST-ARG-BONUS: call i32 @bar( +entry: + %call = tail call i32 @bar(i32 1000) + ret i32 %call +} + +define i32 @goo(i32 %x) { +; CHECK-LABEL: @goo +; CHECK-NOT: call i32 @bar( +; NO-AGGRESSIVE-LABEL: @goo +; NO-AGGRESSIVE: call i32 @bar( +; CALLEE-NUM-LABEL: @goo +; CALLEE-NUM: call i32 @bar( +entry: + %a1 = tail call i32 @bar(i32 1000) + %x1 = add i32 %a1, %a1 + %a2 = tail call i32 @bar(i32 1001) + %x2 = add i32 %x1, %a2 + %a3 = tail call i32 @bar(i32 1002) + %x3 = add i32 %x2, %a3 + %a4 = tail call i32 @bar(i32 1003) + %x4 = add i32 %x3, %a4 + %a5 = tail call i32 @bar(i32 1004) + %x5 = add i32 %x4, %a5 + %a6 = tail call i32 @bar(i32 1005) + %x6 = add i32 %x5, %a6 + %a7 = tail call i32 @bar(i32 1006) + %x7 = add i32 %x6, %a7 + %a8 = tail call i32 @bar(i32 1007) + %x8 = add i32 %x7, %a8 + %a9 = tail call i32 @bar(i32 1008) + %x9 = add i32 %x8, %a9 + %a10 = tail call i32 @bar(i32 1009) + %x10 = add i32 %x9, %a10 + + ret i32 %x10 +}
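
Note on the threshold arithmetic introduced in SimpleInliner::TuneThreshold above: at most one bonus applies per call site, the bonuses are percentages of the base threshold, and the loop bonuses take precedence over the constant-argument bonus. The following standalone C++ sketch is not part of the patch; it merely mirrors that arithmetic under the default flag values defined in InlineSimple.cpp (-inline-small-loop-size=2, -inline-small-loop-bonus=400, -inline-loop-bonus=200, -inline-const-arg-bonus=200). The function name tuneThresholdSketch and its parameters are made up for illustration, and the per-callee call-count cap and debug output are omitted.

// Standalone sketch (not part of the patch) of the bonus arithmetic in
// SimpleInliner::TuneThreshold, assuming the default flag values above.
#include <cstdio>

static unsigned tuneThresholdSketch(unsigned Thres, unsigned LoopDepth,
                                    unsigned BlocksInLoop,
                                    bool ConstArgAndCalleeHasLoop) {
  const unsigned SmallLoopSize = 2;    // -inline-small-loop-size
  const unsigned SmallLoopBonus = 400; // -inline-small-loop-bonus (percent)
  const unsigned LoopBonus = 200;      // -inline-loop-bonus (percent)
  const unsigned ConstArgBonus = 200;  // -inline-const-arg-bonus (percent)

  if (LoopDepth) // the call site sits inside a loop in the caller
    return BlocksInLoop <= SmallLoopSize ? Thres * SmallLoopBonus / 100
                                         : Thres * LoopBonus / 100;
  if (ConstArgAndCalleeHasLoop) // constant actual argument, callee has a loop
    return Thres * ConstArgBonus / 100;
  return Thres; // no bonus applies
}

int main() {
  // With the -O2 base threshold of 225:
  printf("small loop: %u\n", tuneThresholdSketch(225, 1, 2, false)); // 900
  printf("big loop:   %u\n", tuneThresholdSketch(225, 1, 8, false)); // 450
  printf("const arg:  %u\n", tuneThresholdSketch(225, 0, 0, true));  // 450
  return 0;
}

For example, with the base threshold of 225 used in inline-loop.ll, a call site inside a two-block loop is checked against 900, while a call site in a larger loop is checked against 450.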