diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -71,6 +71,20 @@ cl::init(45), cl::ZeroOrMore, cl::desc("Threshold for inlining cold callsites")); +static cl::opt InlineEnableCostBenefitAnalysis( + "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(true), + cl::desc("Enable the cost-benefit analysis for the inliner")); + +static cl::opt InlineSavingsMultiplier( + "inline-savings-multiplier", cl::Hidden, cl::init(8), cl::ZeroOrMore, + cl::desc("Multiplier to multiply cycle savings by during inlining")); + +static cl::opt + InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100), + cl::ZeroOrMore, + cl::desc("The maximum size of a callee that get's " + "inlined without sufficient cycle savings")); + // We introduce this threshold to help performance of instrumentation based // PGO before we actually hook up inliner with analysis passes such as BPI and // BFI. @@ -183,6 +197,9 @@ CallBase &CandidateCall; /// Extension points for handling callsite features. + // Called before a basic block was analyzed. + virtual void onBlockStart(const BasicBlock *BB) {} + /// Called after a basic block was analyzed. virtual void onBlockAnalyzed(const BasicBlock *BB) {} @@ -460,6 +477,15 @@ /// arguments are not counted here. int Cost = 0; + // The cumulative cost at the beginning of the basic block being analyzed. At + // the end of analyzing each basic block, "CostAtBBStart - Cost" represents + // the size of that basic block. + int CostAtBBStart = 0; + + // The static size of live but cold basic blocks. This is "static" in the + // sense that it's not weighted by profile counts at all. + int ColdSize = 0; + bool SingleBB = true; unsigned SROACostSavings = 0; @@ -597,7 +623,19 @@ SROACostSavings += InlineConstants::InstrCost; } + void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; } + void onBlockAnalyzed(const BasicBlock *BB) override { + if (PSI && PSI->hasProfileSummary()) { + // Keep track of the static size of live but cold basic blocks. For now, + // we define a cold basic block to be one that's never executed. + if (BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr) { + auto ProfileCount = BFI->getBlockProfileCount(BB); + if (ProfileCount.hasValue() && ProfileCount.getValue() == 0) + ColdSize += Cost - CostAtBBStart; + } + } + auto *TI = BB->getTerminator(); // If we had any successors at this point, than post-inlining is likely to // have them as well. Note that we assume any basic blocks which existed @@ -628,6 +666,114 @@ InstructionCostDetailMap[I].ThresholdAfter = Threshold; } + // Determine whether we should inline the given call site, taking into account + // both the size cost and the cycle savings. Return None if we don't have + // suficient profiling information to determine. + Optional costBenefitAnalysis() { + if (!PSI || !PSI->hasProfileSummary()) + return None; + + BlockFrequencyInfo *CalleeBFI = GetBFI ? &(GetBFI(F)) : nullptr; + if (!CalleeBFI) + return None; + + auto *CallerBB = CandidateCall.getParent(); + BlockFrequencyInfo *CallerBFI = + GetBFI ? &(GetBFI(*(CallerBB->getParent()))) : nullptr; + if (!CallerBFI) + return None; + + auto EntryProfileCount = + CalleeBFI->getBlockProfileCount(&(F.getEntryBlock())); + if (!EntryProfileCount.hasValue()) + return None; + + auto CallerProfileCount = CallerBFI->getBlockProfileCount(CallerBB); + if (!CallerProfileCount.hasValue()) + return None; + + // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0 + // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by + // falling back to the cost-based metric. + // TODO: Improve this hacky condition. + if (Threshold == 0) + return None; + + // For now, limit to hot call site. + if (!PSI->isHotCallSite(CandidateCall, CallerBFI)) + return None; + + // The cycle savings expressed as the sum of InlineConstants::InstrCost + // multiplied by the estimated dynamic count of each instruction we can + // avoid. Savings come from the call site cost, such as argument setup and + // the call instruction, as well as the instructions that are folded. + // + // We use 128-bit APInt here to avoid potential overflow. This variable + // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case + // assumes that we can avoid or fold a billion instructions, each with a + // profile count of 10^^15 -- roughly the number of cycles for a 24-hour + // period on a 4GHz machine. + APInt CycleSavings(128, 0); + + for (auto &BB : F) { + APInt CurrentSavings(128, 0); + for (auto &I : BB) { + if (BranchInst *BI = dyn_cast(&I)) { + // Count a conditional branch as savings if it becomes unconditional. + if (BI->isConditional() && + dyn_cast_or_null( + SimplifiedValues.lookup(BI->getCondition()))) { + CurrentSavings += InlineConstants::InstrCost; + } + } else if (Value *V = dyn_cast(&I)) { + // Count an instruction as savings if we can fold it. + if (SimplifiedValues.count(V)) { + CurrentSavings += InlineConstants::InstrCost; + } + } + // TODO: Consider other forms of savings like switch statements, + // indirect calls becoming direct, SROACostSavings, LoadEliminationCost, + // etc. + } + + auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB); + assert(ProfileCount.hasValue()); + CurrentSavings *= ProfileCount.getValue(); + CycleSavings += CurrentSavings; + } + + // Compute the weighted savings per call. + auto EntryCount = EntryProfileCount.getValue(); + CycleSavings += EntryCount / 2; + CycleSavings = CycleSavings.udiv(EntryCount); + + // Compute the total savings for the given call. + CycleSavings += getCallsiteCost(this->CandidateCall, DL); + CycleSavings *= CallerProfileCount.getValue(); + + // Remove the cost of the cold basic blocks. + int Size = Cost - ColdSize; + + // Allow tiny callees to be inlined regardless of whether they meet the + // savings threshold. + Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1; + + // Return true if the savings justify the cost of inlining. Specifically, + // we evaluate the following inequality: + // + // CycleSavings PSI->getOrCompHotCountThreshold() + // -------------- >= ----------------------------------- + // Size InlineSavingsMultiplier + // + // Note that the left hand side is specific to a call site. The right hand + // side is a constant for the entire executable. + APInt LHS = CycleSavings; + LHS *= InlineSavingsMultiplier; + APInt RHS(128, PSI->getOrCompHotCountThreshold()); + RHS *= Size; + return LHS.uge(RHS); + } + InlineResult finalizeAnalysis() override { // Loops generally act a lot like calls in that they act like barriers to // movement, require a certain amount of setup, etc. So when optimising for @@ -656,6 +802,16 @@ else if (NumVectorInstructions <= NumInstructions / 2) Threshold -= VectorBonus / 2; + if (InlineEnableCostBenefitAnalysis) { + auto CostBenefitAnalysisResult = costBenefitAnalysis(); + if (CostBenefitAnalysisResult.hasValue()) { + if (CostBenefitAnalysisResult.getValue()) + return InlineResult::success(); + else + return InlineResult::failure("Cost over threshold."); + } + } + if (IgnoreThreshold || Cost < std::max(1, Threshold)) return InlineResult::success(); return InlineResult::failure("Cost over threshold."); @@ -2146,6 +2302,8 @@ if (BB->empty()) continue; + onBlockStart(BB); + // Disallow inlining a blockaddress with uses other than strictly callbr. // A blockaddress only has defined behavior for an indirect branch in the // same function, and we do not currently support inlining indirect