diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -71,6 +71,19 @@ cl::init(45), cl::ZeroOrMore, cl::desc("Threshold for inlining cold callsites")); +static cl::opt InlineEnableCostBenefitAnalysis( + "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(true), + cl::desc("Enable the cost-benefit analysis for the inliner")); + +static cl::opt + InlineSavingsMultiplier("inline-savings-multiplier", cl::Hidden, + cl::init(8), cl::ZeroOrMore, + cl::desc("Inline savings multiplier")); + +static cl::opt InlineSizeAllowance("inline-size-allowance", cl::Hidden, + cl::init(100), cl::ZeroOrMore, + cl::desc("Inline size allowance")); + // We introduce this threshold to help performance of instrumentation based // PGO before we actually hook up inliner with analysis passes such as BPI and // BFI. @@ -183,6 +196,9 @@ CallBase &CandidateCall; /// Extension points for handling callsite features. + // Called before a basic block was analyzed. + virtual void onBlockStart(const BasicBlock *BB) {} + /// Called after a basic block was analyzed. virtual void onBlockAnalyzed(const BasicBlock *BB) {} @@ -460,6 +476,12 @@ /// arguments are not counted here. int Cost = 0; + // The cumulative cost at the beginning of the basic block being analyzed. + int CostAtBBStart = 0; + + // The cost of cold basic blocks. + int ColdSize = 0; + bool SingleBB = true; unsigned SROACostSavings = 0; @@ -597,7 +619,17 @@ SROACostSavings += InlineConstants::InstrCost; } + void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; } + void onBlockAnalyzed(const BasicBlock *BB) override { + // Keep track of the cost of cold basic blocks. For now, we define a cold + // basic block to be one that's never executed. + if (BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr) { + auto ProfileCount = BFI->getBlockProfileCount(BB); + if (ProfileCount.hasValue() && ProfileCount.getValue() == 0) + ColdSize += Cost - CostAtBBStart; + } + auto *TI = BB->getTerminator(); // If we had any successors at this point, than post-inlining is likely to // have them as well. Note that we assume any basic blocks which existed @@ -628,6 +660,105 @@ InstructionCostDetailMap[I].ThresholdAfter = Threshold; } + // Determine whether we should inline the given call site, taking into account + // both the size cost and the cycle savings. Return None if we don't have + // suficient profiling information to determine. + Optional costBenefitAnalysis() { + if (!PSI || !PSI->hasProfileSummary()) + return None; + + BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr; + if (!BFI) + return None; + + auto CallerBB = CandidateCall.getParent(); + BlockFrequencyInfo *CallerBFI = + GetBFI ? &(GetBFI(*(CallerBB->getParent()))) : nullptr; + if (!CallerBFI) + return None; + + auto CallerProfileCount = CallerBFI->getBlockProfileCount(CallerBB); + if (!CallerProfileCount.hasValue()) + return None; + + for (auto &BB : F) { + auto ProfileCount = BFI->getBlockProfileCount(&BB); + if (!ProfileCount.hasValue()) + return None; + } + + // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0 + // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by + // falling back to the cost-based metric. + // TODO: Improve this hacky condition. + if (Threshold == 0) + return None; + + // For now, limit to hot call site. + if (!PSI->isHotCallSite(CandidateCall, CallerBFI)) + return None; + + uint64_t WeightedSavings = 0; + for (auto &BB : F) { + uint64_t CurrentSavings = 0; + for (auto &I : BB) { + if (BranchInst *BI = dyn_cast(&I)) { + // Count a conditional branch as savings if it becomes unconditional. + if (BI->isConditional() && + dyn_cast_or_null( + SimplifiedValues.lookup(BI->getCondition()))) { + CurrentSavings += InlineConstants::InstrCost; + } + } else if (Value *V = dyn_cast(&I)) { + // Count an instruction as savings if we can fold it. + if (SimplifiedValues.count(V)) { + CurrentSavings += InlineConstants::InstrCost; + } + } + // TODO: Consider other forms of savings like switch statements, + // indirect calls becoming direct, SROACostSavings, LoadEliminationCost, + // etc. + } + + auto ProfileCount = BFI->getBlockProfileCount(&BB); + assert(ProfileCount.hasValue()); + WeightedSavings += ProfileCount.getValue() * CurrentSavings; + } + + // Compute the weighted savings per call. + auto EntryProfileCount = BFI->getBlockProfileCount(&(F.getEntryBlock())); + assert(EntryProfileCount.hasValue()); + if (EntryProfileCount.getValue()) + WeightedSavings = ((WeightedSavings + EntryProfileCount.getValue() / 2) / + EntryProfileCount.getValue()); + else + WeightedSavings = 0; + + // Compute the total savings for the given call. + WeightedSavings += getCallsiteCost(this->CandidateCall, DL); + WeightedSavings *= CallerProfileCount.getValue(); + + // Remove the cost of the cold basic blocks. + int Size = Cost - ColdSize; + + // Allow tiny callees to be inlined regardless of whether they meet the + // savings threshold. + Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1; + + // Return true if the savings justify the cost of inlining. Specifically, + // we evaluate the following inequality: + // + // WeightedSavings PSI->getOrCompHotCountThreshold() + // ----------------- >= ----------------------------------- + // Size InlineSavingsMultiplier + // + // Note that the left hand side is specific to a call site. The right hand + // side is a constant for the entire executable. + auto LHS = WeightedSavings * InlineSavingsMultiplier; + auto RHS = Size * PSI->getOrCompHotCountThreshold(); + return LHS >= RHS; + } + InlineResult finalizeAnalysis() override { // Loops generally act a lot like calls in that they act like barriers to // movement, require a certain amount of setup, etc. So when optimising for @@ -656,6 +787,16 @@ else if (NumVectorInstructions <= NumInstructions / 2) Threshold -= VectorBonus / 2; + if (InlineEnableCostBenefitAnalysis) { + auto CostBenefitAnalysisResult = costBenefitAnalysis(); + if (CostBenefitAnalysisResult.hasValue()) { + if (CostBenefitAnalysisResult.getValue()) + return InlineResult::success(); + else + return InlineResult::failure("Cost over threshold."); + } + } + if (IgnoreThreshold || Cost < std::max(1, Threshold)) return InlineResult::success(); return InlineResult::failure("Cost over threshold."); @@ -2146,6 +2287,8 @@ if (BB->empty()) continue; + onBlockStart(BB); + // Disallow inlining a blockaddress with uses other than strictly callbr. // A blockaddress only has defined behavior for an indirect branch in the // same function, and we do not currently support inlining indirect