diff --git a/llvm/include/llvm/Analysis/InlineCost.h b/llvm/include/llvm/Analysis/InlineCost.h --- a/llvm/include/llvm/Analysis/InlineCost.h +++ b/llvm/include/llvm/Analysis/InlineCost.h @@ -15,6 +15,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/InlineModelFeatureMaps.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include #include @@ -270,6 +271,15 @@ ProfileSummaryInfo *PSI = nullptr, OptimizationRemarkEmitter *ORE = nullptr); +/// Get the expanded cost features. The features are returned unconditionally, +/// even if inlining is impossible. +Optional getInliningCostFeatures( + CallBase &Call, TargetTransformInfo &CalleeTTI, + function_ref GetAssumptionCache, + function_ref GetBFI = nullptr, + ProfileSummaryInfo *PSI = nullptr, + OptimizationRemarkEmitter *ORE = nullptr); + /// Minimal filter to detect invalid constructs for inlining. InlineResult isInlineViable(Function &Callee); diff --git a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h --- a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h +++ b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h @@ -16,6 +16,61 @@ namespace llvm { +// List of cost features. A "cost" feature is a summand of the heuristic-based +// inline cost, and we define them separately to preserve the original heuristic +// behavior. +#define INLINE_COST_FEATURE_ITERATOR(M) \ + M(SROASavings, "sroa_savings") \ + M(SROALosses, "sroa_losses") \ + M(LoadElimination, "load_elimination") \ + M(CallPenalty, "call_penalty") \ + M(CallArgumentSetup, "call_argument_setup") \ + M(LoadRelativeIntrinsic, "load_relative_intrinsic") \ + M(LoweredCallArgSetup, "lowered_call_arg_setup") \ + M(IndirectCallPenalty, "indirect_call_penalty") \ + M(JumpTablePenalty, "jump_table_penalty") \ + M(CaseClusterPenalty, "case_cluster_penalty") \ + M(SwitchPenalty, "switch_penalty") \ + M(UnsimplifiedCommonInstructions, "unsimplified_common_instructions") \ + M(NumLoops, "num_loops") \ + M(DeadBlocks, "dead_blocks") \ + M(SimplifiedInstructions, "simplified_instructions") \ + M(ConstantArgs, "constant_args") \ + M(ConstantOffsetPtrArgs, "constant_offset_ptr_args") \ + M(CallSiteCost, "callsite_cost") \ + M(ColdCcPenalty, "cold_cc_penalty") \ + M(LastCallToStaticBonus, "last_call_to_static_bonus") \ + M(IsMultipleBlocks, "is_multiple_blocks") \ + M(NestedInlines, "nested_inlines") \ + M(NestedInlineBonus, "nested_inline_bonus") \ + M(Threshold, "threshold") + +// clang-format off +enum class InlineCostFeatureIndex : size_t { +#define POPULATE_INDICES(INDEX_NAME, NAME) INDEX_NAME, + INLINE_COST_FEATURE_ITERATOR(POPULATE_INDICES) +#undef POPULATE_INDICES + + NumberOfFeatures +}; +// clang-format on + +using InlineCostFeatures = + std::array(InlineCostFeatureIndex::NumberOfFeatures)>; + +constexpr bool isHeuristicInlineCostFeature(InlineCostFeatureIndex Feature) { + return Feature != InlineCostFeatureIndex::SROASavings && + Feature != InlineCostFeatureIndex::IsMultipleBlocks && + Feature != InlineCostFeatureIndex::DeadBlocks && + Feature != InlineCostFeatureIndex::SimplifiedInstructions && + Feature != InlineCostFeatureIndex::ConstantArgs && + Feature != InlineCostFeatureIndex::ConstantOffsetPtrArgs && + Feature != InlineCostFeatureIndex::NestedInlines && + Feature != InlineCostFeatureIndex::NestedInlineBonus && + Feature != InlineCostFeatureIndex::Threshold; +} + // List of features. Each feature is defined through a triple: // - the name of an enum member, which will be the feature index // - a textual name, used for Tensorflow model binding (so it needs to match the @@ -33,7 +88,6 @@ "total current number of defined functions in the module") \ M(NrCtantParams, "nr_ctant_params", \ "number of parameters in the call site that are constants") \ - M(CostEstimate, "cost_estimate", "total cost estimate (threshold - free)") \ M(EdgeCount, "edge_count", "total number of calls in the module") \ M(CallerUsers, "caller_users", \ "number of module-internal users of the caller, +1 if the caller is " \ @@ -46,14 +100,29 @@ "number of blocks reached from a conditional instruction, in the callee") \ M(CalleeUsers, "callee_users", \ "number of module-internal users of the callee, +1 if the callee is " \ - "exposed externally") + "exposed externally") \ + M(CostEstimate, "cost_estimate", "total cost estimate (threshold - free)") +// clang-format off enum class FeatureIndex : size_t { +// InlineCost features - these must come first +#define POPULATE_INDICES(INDEX_NAME, NAME) INDEX_NAME, + INLINE_COST_FEATURE_ITERATOR(POPULATE_INDICES) +#undef POPULATE_INDICES + +// Non-cost features #define POPULATE_INDICES(INDEX_NAME, NAME, COMMENT) INDEX_NAME, INLINE_FEATURE_ITERATOR(POPULATE_INDICES) #undef POPULATE_INDICES - NumberOfFeatures + + NumberOfFeatures }; +// clang-format on + +constexpr FeatureIndex +inlineCostFeatureToMlFeature(InlineCostFeatureIndex Feature) { + return static_cast(static_cast(Feature)); +} constexpr size_t NumberOfFeatures = static_cast(FeatureIndex::NumberOfFeatures); diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -5,7 +5,7 @@ # This url points to the most recent most which is known to be compatible with # LLVM. When better models are published, this url should be updated to aid # discoverability. - set(LLVM_INLINER_MODEL_CURRENT_URL "https://github.com/google/ml-compiler-opt/releases/download/inlining-Oz-v0.1/inlining-Oz-acabaf6-v0.1.tar.gz") + set(LLVM_INLINER_MODEL_CURRENT_URL "TO_BE_UPDATED") if (DEFINED LLVM_HAVE_TF_AOT) # If the path is empty, autogenerate the model diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -438,7 +438,6 @@ void dump(); }; - /// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note /// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer class InlineCostCallAnalyzer final : public CallAnalyzer { @@ -936,6 +935,211 @@ int getCost() { return Cost; } bool wasDecidedByCostBenefit() { return DecidedByCostBenefit; } }; + +class InlineCostFeaturesAnalyzer final : public CallAnalyzer { +private: + InlineCostFeatures Cost = {}; + unsigned SROACostSavingOpportunities = 0; + int VectorBonus = 0; + int SingleBBBonus = 0; + int Threshold = 5; + + DenseMap SROACosts; + + void update(InlineCostFeatureIndex Feature, int64_t Delta) { + Cost[static_cast(Feature)] += Delta; + } + + void set(InlineCostFeatureIndex Feature, int64_t Value) { + Cost[static_cast(Feature)] = Value; + } + + void onDisableSROA(AllocaInst *Arg) override { + auto CostIt = SROACosts.find(Arg); + if (CostIt == SROACosts.end()) + return; + + update(InlineCostFeatureIndex::SROALosses, CostIt->second); + SROACostSavingOpportunities -= CostIt->second; + SROACosts.erase(CostIt); + } + + void onDisableLoadElimination() override { + set(InlineCostFeatureIndex::LoadElimination, 1); + } + + void onCallPenalty() override { + update(InlineCostFeatureIndex::CallPenalty, InlineConstants::CallPenalty); + } + + void onCallArgumentSetup(const CallBase &Call) override { + update(InlineCostFeatureIndex::CallArgumentSetup, + Call.arg_size() * InlineConstants::InstrCost); + } + + void onLoadRelativeIntrinsic() override { + update(InlineCostFeatureIndex::LoadRelativeIntrinsic, + 3 * InlineConstants::InstrCost); + } + + void onLoweredCall(Function *F, CallBase &Call, + bool IsIndirectCall) override { + update(InlineCostFeatureIndex::LoweredCallArgSetup, + Call.arg_size() * InlineConstants::InstrCost); + + if (IsIndirectCall) { + InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0, + /*HintThreshold*/ {}, + /*ColdThreshold*/ {}, + /*OptSizeThreshold*/ {}, + /*OptMinSizeThreshold*/ {}, + /*HotCallSiteThreshold*/ {}, + /*LocallyHotCallSiteThreshold*/ {}, + /*ColdCallSiteThreshold*/ {}, + /*ComputeFullInlineCost*/ true, + /*EnableDeferral*/ true}; + IndirectCallParams.DefaultThreshold = + InlineConstants::IndirectCallThreshold; + + InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI, + GetAssumptionCache, GetBFI, PSI, ORE, false, + true); + if (CA.analyze().isSuccess()) { + update(InlineCostFeatureIndex::NestedInlineBonus, CA.getCost()); + update(InlineCostFeatureIndex::NestedInlines, 1); + } + } else { + onCallPenalty(); + } + } + + void accumulateBonus(const InlineCostFeatures &OtherCost) { + size_t NestedInlineIdx = + static_cast(InlineCostFeatureIndex::NestedInlines); + set(InlineCostFeatureIndex::NestedInlines, + std::max(Cost[NestedInlineIdx], OtherCost[NestedInlineIdx] + 1)); + + int64_t Bonus = 0; + for (int I = 0; + I < static_cast(InlineCostFeatureIndex::NumberOfFeatures); ++I) { + if (isHeuristicInlineCostFeature( + static_cast(I)) || + static_cast(I) == + InlineCostFeatureIndex::NestedInlineBonus) { + Bonus += Cost[I]; + } + } + update(InlineCostFeatureIndex::NestedInlineBonus, Bonus); + } + + void onFinalizeSwitch(unsigned JumpTableSize, + unsigned NumCaseCluster) override { + + if (JumpTableSize) { + int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost + + 4 * InlineConstants::InstrCost; + update(InlineCostFeatureIndex::JumpTablePenalty, JTCost); + return; + } + + if (NumCaseCluster <= 3) { + update(InlineCostFeatureIndex::CaseClusterPenalty, + NumCaseCluster * 2 * InlineConstants::InstrCost); + return; + } + + int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1; + int64_t SwitchCost = + ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost; + update(InlineCostFeatureIndex::SwitchPenalty, SwitchCost); + } + + void onMissedSimplification() override { + update(InlineCostFeatureIndex::UnsimplifiedCommonInstructions, + InlineConstants::InstrCost); + } + + void onInitializeSROAArg(AllocaInst *Arg) override { SROACosts[Arg] = 0; } + void onAggregateSROAUse(AllocaInst *Arg) override { + SROACosts.find(Arg)->second += InlineConstants::InstrCost; + SROACostSavingOpportunities += InlineConstants::InstrCost; + } + + void onBlockAnalyzed(const BasicBlock *BB) override { + if (BB->getTerminator()->getNumSuccessors() > 1) + set(InlineCostFeatureIndex::IsMultipleBlocks, 1); + Threshold -= SingleBBBonus; + } + + InlineResult finalizeAnalysis() override { + auto *Caller = CandidateCall.getFunction(); + if (Caller->hasMinSize()) { + DominatorTree DT(F); + LoopInfo LI(DT); + for (Loop *L : LI) { + // Ignore loops that will not be executed + if (DeadBlocks.count(L->getHeader())) + continue; + update(InlineCostFeatureIndex::NumLoops, InlineConstants::CallPenalty); + } + } + set(InlineCostFeatureIndex::DeadBlocks, DeadBlocks.size()); + set(InlineCostFeatureIndex::SimplifiedInstructions, + NumInstructionsSimplified); + set(InlineCostFeatureIndex::ConstantArgs, NumConstantArgs); + set(InlineCostFeatureIndex::ConstantOffsetPtrArgs, + NumConstantOffsetPtrArgs); + set(InlineCostFeatureIndex::SROASavings, SROACostSavingOpportunities); + + if (NumVectorInstructions <= NumInstructions / 10) + update(InlineCostFeatureIndex::Threshold, -1 * VectorBonus); + else if (NumVectorInstructions <= NumInstructions / 2) + update(InlineCostFeatureIndex::Threshold, -1 * (VectorBonus / 2)); + + set(InlineCostFeatureIndex::Threshold, Threshold); + + return InlineResult::success(); + } + + bool shouldStop() override { return false; } + + void onLoadEliminationOpportunity() override { + update(InlineCostFeatureIndex::LoadElimination, 1); + } + + InlineResult onAnalysisStart() override { + update(InlineCostFeatureIndex::CallSiteCost, + -1 * getCallsiteCost(this->CandidateCall, DL)); + + set(InlineCostFeatureIndex::ColdCcPenalty, + (F.getCallingConv() == CallingConv::Cold)); + + // FIXME: we shouldn't repeat this logic in both the Features and Cost + // analyzer - instead, we should abstract it to a common method in the + // CallAnalyzer + int SingleBBBonusPercent = 50; + int VectorBonusPercent = TTI.getInlinerVectorBonusPercent(); + Threshold += TTI.adjustInliningThreshold(&CandidateCall); + Threshold *= TTI.getInliningThresholdMultiplier(); + SingleBBBonus = Threshold * SingleBBBonusPercent / 100; + VectorBonus = Threshold * VectorBonusPercent / 100; + Threshold += (SingleBBBonus + VectorBonus); + + return InlineResult::success(); + } + +public: + InlineCostFeaturesAnalyzer( + const TargetTransformInfo &TTI, + function_ref &GetAssumptionCache, + function_ref GetBFI, + ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee, + CallBase &Call) + : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {} + + const InlineCostFeatures &features() const { return Cost; } +}; + } // namespace /// Test whether the given value is an Alloca-derived function argument. @@ -2502,6 +2706,19 @@ return CA.getCost(); } +Optional llvm::getInliningCostFeatures( + CallBase &Call, TargetTransformInfo &CalleeTTI, + function_ref GetAssumptionCache, + function_ref GetBFI, + ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { + InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, + ORE, *Call.getCalledFunction(), Call); + auto R = CFA.analyze(); + if (!R.isSuccess()) + return None; + return CFA.features(); +} + Optional llvm::getAttributeBasedInliningDecision( CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI, function_ref GetTLI) { diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp --- a/llvm/lib/Analysis/MLInlineAdvisor.cpp +++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp @@ -43,11 +43,19 @@ "blocking any further inlining."), cl::init(2.0)); +// clang-format off const std::array llvm::FeatureNameMap{ +// InlineCost features - these must come first +#define POPULATE_NAMES(INDEX_NAME, NAME) NAME, + INLINE_COST_FEATURE_ITERATOR(POPULATE_NAMES) +#undef POPULATE_NAMES + +// Non-cost features #define POPULATE_NAMES(INDEX_NAME, NAME, COMMENT) NAME, - INLINE_FEATURE_ITERATOR(POPULATE_NAMES) + INLINE_FEATURE_ITERATOR(POPULATE_NAMES) #undef POPULATE_NAMES }; +// clang-format on const char *const llvm::DecisionName = "inlining_decision"; const char *const llvm::DefaultDecisionName = "inlining_default"; @@ -217,6 +225,12 @@ CostEstimate = *IsCallSiteInlinable; } + const auto CostFeatures = + llvm::getInliningCostFeatures(CB, TIR, GetAssumptionCache); + if (!CostFeatures) { + return std::make_unique(this, CB, ORE, false); + } + if (Mandatory) return getMandatoryAdvice(CB, true); @@ -234,7 +248,6 @@ FunctionLevels[&Caller]); ModelRunner->setFeature(FeatureIndex::NodeCount, NodeCount); ModelRunner->setFeature(FeatureIndex::NrCtantParams, NrCtantParams); - ModelRunner->setFeature(FeatureIndex::CostEstimate, CostEstimate); ModelRunner->setFeature(FeatureIndex::EdgeCount, EdgeCount); ModelRunner->setFeature(FeatureIndex::CallerUsers, CallerBefore.Uses); ModelRunner->setFeature(FeatureIndex::CallerConditionallyExecutedBlocks, @@ -244,6 +257,16 @@ ModelRunner->setFeature(FeatureIndex::CalleeConditionallyExecutedBlocks, CalleeBefore.BlocksReachedFromConditionalInstruction); ModelRunner->setFeature(FeatureIndex::CalleeUsers, CalleeBefore.Uses); + ModelRunner->setFeature(FeatureIndex::CostEstimate, CostEstimate); + + // Add the cost features + for (size_t I = 0; + I < static_cast(InlineCostFeatureIndex::NumberOfFeatures); ++I) { + ModelRunner->setFeature( + inlineCostFeatureToMlFeature(static_cast(I)), + CostFeatures->at(I)); + } + return getAdviceFromModel(CB, ORE); } diff --git a/llvm/lib/Analysis/models/inlining/config.py b/llvm/lib/Analysis/models/inlining/config.py --- a/llvm/lib/Analysis/models/inlining/config.py +++ b/llvm/lib/Analysis/models/inlining/config.py @@ -26,11 +26,42 @@ # int64 features inputs = [ tf.TensorSpec(dtype=tf.int64, shape=(), name=key) for key in [ - 'caller_basic_block_count', 'caller_conditionally_executed_blocks', - 'caller_users', 'callee_basic_block_count', - 'callee_conditionally_executed_blocks', 'callee_users', - 'nr_ctant_params', 'node_count', 'edge_count', 'callsite_height', - 'cost_estimate', 'inlining_default' + 'caller_basic_block_count', + 'caller_conditionally_executed_blocks', + 'caller_users', + 'callee_basic_block_count', + 'callee_conditionally_executed_blocks', + 'callee_users', + 'nr_ctant_params', + 'node_count', + 'edge_count', + 'callsite_height', + 'cost_estimate', + 'inlining_default', + 'sroa_savings', + 'sroa_losses', + 'load_elimination', + 'call_penalty', + 'call_argument_setup', + 'load_relative_intrinsic', + 'lowered_call_arg_setup', + 'indirect_call_penalty', + 'jump_table_penalty', + 'case_cluster_penalty', + 'switch_penalty', + 'unsimplified_common_instructions', + 'num_loops', + 'dead_blocks', + 'simplified_instructions', + 'constant_args', + 'constant_offset_ptr_args', + 'callsite_cost', + 'cold_cc_penalty', + 'last_call_to_static_bonus', + 'is_multiple_blocks', + 'nested_inlines', + 'nested_inline_bonus', + 'threshold', ] ]