diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -348,6 +348,9 @@ /// individual classes of instructions would be better. unsigned getInliningThresholdMultiplier() const; + unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const; + unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const; + /// \returns A value to be added to the inlining threshold. unsigned adjustInliningThreshold(const CallBase *CB) const; @@ -1696,6 +1699,9 @@ const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) = 0; virtual unsigned getInliningThresholdMultiplier() const = 0; + virtual unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const = 0; + virtual unsigned + getInliningCostBenefitAnalysisProfitableMultiplier() const = 0; virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0; virtual int getInlinerVectorBonusPercent() const = 0; virtual unsigned getCallerAllocaCost(const CallBase *CB, @@ -2068,6 +2074,12 @@ unsigned adjustInliningThreshold(const CallBase *CB) override { return Impl.adjustInliningThreshold(CB); } + unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const override { + return Impl.getInliningCostBenefitAnalysisSavingsMultiplier(); + } + unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const override { + return Impl.getInliningCostBenefitAnalysisProfitableMultiplier(); + } int getInlinerVectorBonusPercent() const override { return Impl.getInlinerVectorBonusPercent(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -69,6 +69,10 @@ } unsigned getInliningThresholdMultiplier() const { return 1; } + unsigned 
getInliningCostBenefitAnalysisSavingsMultiplier() const { return 8; } + unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const { + return 8; + } unsigned adjustInliningThreshold(const CallBase *CB) const { return 0; } unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const { return 0; diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -88,10 +88,21 @@ "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false), cl::desc("Enable the cost-benefit analysis for the inliner")); +// InlineSavingsMultiplier overrides per TTI multipliers iff it is +// specified explicitly in command line options. This option is exposed +// for tuning and testing. static cl::opt InlineSavingsMultiplier( "inline-savings-multiplier", cl::Hidden, cl::init(8), cl::desc("Multiplier to multiply cycle savings by during inlining")); +// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is +// specified explicitly in command line options. This option is exposed +// for tuning and testing. +static cl::opt InlineSavingsProfitableMultiplier( + "inline-savings-profitable-multiplier", cl::Hidden, cl::init(4), + cl::desc("A multiplier on top of cycle savings to decide whether the " + "savings won't justify the cost")); + static cl::opt InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100), cl::desc("The maximum size of a callee that get's " @@ -815,6 +826,32 @@ return true; } + // A helper function to choose between command line override and default. + unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const { + if (InlineSavingsMultiplier.getNumOccurrences()) + return InlineSavingsMultiplier; + return TTI.getInliningCostBenefitAnalysisSavingsMultiplier(); + } + + // A helper function to choose between command line override and default. 
+ unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const { + if (InlineSavingsProfitableMultiplier.getNumOccurrences()) + return InlineSavingsProfitableMultiplier; + return TTI.getInliningCostBenefitAnalysisProfitableMultiplier(); + } + + void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) { + if (std::optional AttrCycleSavings = getStringFnAttrAsInt( + CandidateCall, "inline-cycle-savings-for-test")) { + CycleSavings = *AttrCycleSavings; + } + + if (std::optional AttrRuntimeCost = getStringFnAttrAsInt( + CandidateCall, "inline-runtime-cost-for-test")) { + Size = *AttrRuntimeCost; + } + } + // Determine whether we should inline the given call site, taking into account // both the size cost and the cycle savings. Return std::nullopt if we don't // have sufficient profiling information to determine. @@ -884,29 +921,55 @@ CycleSavings += getCallsiteCost(this->CandidateCall, DL); CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB); - // Remove the cost of the cold basic blocks. + // Remove the cost of the cold basic blocks to model the runtime cost more + // accurately. Both machine block placement and function splitting could + // place cold blocks further from hot blocks. int Size = Cost - ColdSize; // Allow tiny callees to be inlined regardless of whether they meet the // savings threshold. Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1; + OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size); CostBenefit.emplace(APInt(128, Size), CycleSavings); - // Return true if the savings justify the cost of inlining. Specifically, - // we evaluate the following inequality: + // Let R be the ratio of CycleSavings to Size. We accept the inlining + // opportunity if R is really high and reject if R is really low. If R is + // somewhere in the middle, we fall back to the cost-based analysis. 
// - // CycleSavings PSI->getOrCompHotCountThreshold() - // -------------- >= ----------------------------------- - // Size InlineSavingsMultiplier + // Specifically, let R = CycleSavings / Size, we accept the inlining + // opportunity if: // - // Note that the left hand side is specific to a call site. The right hand - // side is a constant for the entire executable. - APInt LHS = CycleSavings; - LHS *= InlineSavingsMultiplier; - APInt RHS(128, PSI->getOrCompHotCountThreshold()); - RHS *= Size; - return LHS.uge(RHS); + // PSI->getOrCompHotCountThreshold() + // R >= ------------------------------------------------- + // getInliningCostBenefitAnalysisSavingsMultiplier() + // + // and reject the inlining opportunity if: + // + // PSI->getOrCompHotCountThreshold() + // R < ---------------------------------------------------- + // getInliningCostBenefitAnalysisProfitableMultiplier() + // + // Otherwise, we fall back to the cost-based analysis. + // + // Implementation-wise, use multiplication (CycleSavings * Multiplier, + // HotCountThreshold * Size) rather than division to avoid precision loss. + APInt Threshold(128, PSI->getOrCompHotCountThreshold()); + Threshold *= Size; + + APInt UpperBoundCycleSavings = CycleSavings; + UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier(); + if (UpperBoundCycleSavings.uge(Threshold)) + return true; + + APInt LowerBoundCycleSavings = CycleSavings; + LowerBoundCycleSavings *= + getInliningCostBenefitAnalysisProfitableMultiplier(); + if (LowerBoundCycleSavings.ult(Threshold)) + return false; + + // Otherwise, fall back to the cost-based analysis. 
+ return std::nullopt; } InlineResult finalizeAnalysis() override { diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -212,6 +212,17 @@ return TTIImpl->getInliningThresholdMultiplier(); } +unsigned +TargetTransformInfo::getInliningCostBenefitAnalysisSavingsMultiplier() const { + return TTIImpl->getInliningCostBenefitAnalysisSavingsMultiplier(); +} + +unsigned +TargetTransformInfo::getInliningCostBenefitAnalysisProfitableMultiplier() + const { + return TTIImpl->getInliningCostBenefitAnalysisProfitableMultiplier(); +} + unsigned TargetTransformInfo::adjustInliningThreshold(const CallBase *CB) const { return TTIImpl->adjustInliningThreshold(CB); diff --git a/llvm/test/Transforms/Inline/inline-cost-benefit-multiplier-override.ll b/llvm/test/Transforms/Inline/inline-cost-benefit-multiplier-override.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/Inline/inline-cost-benefit-multiplier-override.ll @@ -0,0 +1,67 @@ +; RUN: opt < %s -passes='require,cgscc(inline)' -pass-remarks=inline -pass-remarks-missed=inline -inline-savings-multiplier=4 -inline-savings-profitable-multiplier=5 -S 2>&1| FileCheck %s + +; Test that inline cost benefit multiplier could be configured from command line. + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; @inlined_callee is inlined by cost-benefit analysis. 
+; @not_inlined_callee is not inlined, decided by cost-benefit analysis +; CHECK: remark: :0:0: 'inlined_callee' inlined into 'caller' with (cost=always): benefit over cost +; CHECK: remark: :0:0: 'not_inlined_callee' not inlined into 'caller' because it should never be inlined (cost=never): cost over benefit + +define i32 @inlined_callee(i32 %c) !prof !17 { +entry: + %mul = mul nsw i32 %c, %c + ret i32 %mul +} + +define i32 @not_inlined_callee(i32 %c) !prof !18 { +entry: + %add = add nsw i32 %c, 2 + ret i32 %add +} + +define i32 @caller(i32 %a, i32 %c) !prof !15 { +entry: + %rem = srem i32 %a, 3 + %cmp = icmp eq i32 %rem, 0 + br i1 %cmp, label %if.then, label %if.end, !prof !16 + +if.then: +; CHECK-LABEL: if.then: +; CHECK-NOT: call i32 @inlined_callee + %call = tail call i32 @inlined_callee(i32 %c) "inline-cycle-savings-for-test"="26" "inline-runtime-cost-for-test"="1" + br label %return + +if.end: +; CHECK-LABEL: if.end: +; CHECK: call i32 @not_inlined_callee + %call1 = tail call i32 @not_inlined_callee(i32 %c) "inline-cycle-savings-for-test"="19" "inline-runtime-cost-for-test"="1" + br label %return + +return: + %retval.0 = phi i32 [ %call, %if.then ], [ %call1, %if.end ] + ret i32 %retval.0 +} + +!llvm.module.flags = !{!1} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 1000} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 1000} +!8 = !{!"NumCounts", i64 3} +!9 = !{!"NumFunctions", i64 3} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 100, i32 1} +!13 = !{i32 990000, i64 100, i32 1} +!14 = !{i32 999999, i64 1, i32 2} +!15 = !{!"function_entry_count", i64 500} +!16 = !{!"branch_weights", i32 1, i32 2} +!17 = !{!"function_entry_count", i64 200} +!18 = !{!"function_entry_count", i64 400}