Index: llvm/CMakeLists.txt =================================================================== --- llvm/CMakeLists.txt +++ llvm/CMakeLists.txt @@ -896,6 +896,34 @@ llvm_replace_compiler_option(CMAKE_CXX_FLAGS_RELEASE "-O3" "-O2") endif() +set(LLVM_USE_ML_POLICY "" CACHE STRING + "Opt in to using machine learned optimization policies. '' (default) to opt out. Use 'Rel' for using pre-trained policy, 'Dev' for building LLVM for training purposes.") + +string(TOUPPER "${LLVM_USE_ML_POLICY}" LLVM_USE_ML_POLICY) + +if (LLVM_USE_ML_POLICY AND + NOT LLVM_USE_ML_POLICY MATCHES "^(REL|DEV|)$") + message(FATAL_ERROR "Invalid value for LLVM_USE_ML_POLICY: ${LLVM_USE_ML_POLICY}") +endif() + +if (LLVM_USE_ML_POLICY STREQUAL "DEV") + add_definitions("-DLLVM_USE_ML_POLICY_DEV") + set(LLVM_TF_C_LIB "" CACHE PATH "Path to TensorFlow C library install") + include_directories(${LLVM_TF_C_LIB}/include) + find_library(tensorflow_c_api tensorflow PATHS ${LLVM_TF_C_LIB}/lib) +endif () + +if (LLVM_USE_ML_POLICY STREQUAL "REL") + add_definitions("-DLLVM_USE_ML_POLICY_REL") + set(LLVM_TF_AOT_COMPILER "saved_model_cli" CACHE PATH + "Path/command line for TensorFlow AOT compiler") + set(LLVM_TF_AOT_RUNTIME "" CACHE PATH "Path to TensorFlow pip install dir") + include_directories(${LLVM_TF_AOT_RUNTIME}/include) + add_subdirectory(${LLVM_TF_AOT_RUNTIME}/xla_aot_runtime_src + ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}/tf_runtime) + add_llvm_library(LLVMtf_aot_runtime $) +endif () + # Put this before tblgen. Else we have a circular dependence. 
add_subdirectory(lib/Demangle) add_subdirectory(lib/Support) Index: llvm/cmake/modules/TensorFlowCompile.cmake =================================================================== --- /dev/null +++ llvm/cmake/modules/TensorFlowCompile.cmake @@ -0,0 +1,18 @@ +function(tfcompile model tag_set signature_def_key fname cpp_class) + if (IS_ABSOLUTE ${model}) + set(LLVM_ML_MODELS_ABSOLUTE ${model}) + else() + set(LLVM_ML_MODELS_ABSOLUTE + ${CMAKE_CURRENT_SOURCE_DIR}/${model}) + endif() + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${fname}.o ${CMAKE_CURRENT_BINARY_DIR}/${fname}.h + COMMAND "XLA_FLAGS=\"--xla_cpu_multi_thread_eigen=false\"" ${LLVM_TF_AOT_COMPILER} aot_compile_cpu + --dir ${LLVM_ML_MODELS_ABSOLUTE} + --tag_set ${tag_set} + --signature_def_key ${signature_def_key} + --output_prefix ${CMAKE_CURRENT_BINARY_DIR}/${fname} + --cpp_class ${cpp_class} + ) + + set(TF_AOT_OBJ ${CMAKE_CURRENT_BINARY_DIR}/${fname}.o PARENT_SCOPE) +endfunction() \ No newline at end of file Index: llvm/include/llvm/Analysis/InlineCost.h =================================================================== --- llvm/include/llvm/Analysis/InlineCost.h +++ llvm/include/llvm/Analysis/InlineCost.h @@ -236,6 +236,16 @@ function_ref GetTLI, ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE); +Optional getTrivialInliningDecision( + CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI, + std::function &GetAssumptionCache, + function_ref GetTLI); + +Optional getInliningCostEstimate( + CallBase &Call, TargetTransformInfo &CalleeTTI, + std::function &GetAssumptionCache, + Optional> GetBFI, + ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE); /// Minimal filter to detect invalid constructs for inlining. 
InlineResult isInlineViable(Function &Callee); } // namespace llvm Index: llvm/include/llvm/Analysis/ML/IRToNativeSizeLearning.h =================================================================== --- /dev/null +++ llvm/include/llvm/Analysis/ML/IRToNativeSizeLearning.h @@ -0,0 +1,50 @@ +#ifndef LLVM_ML_IRTONATIVESIZELEARNING_H_ +#define LLVM_ML_IRTONATIVESIZELEARNING_H_ + +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { +class IRToNativeSizeLearning { +public: + struct FunctionFeatures { + static const size_t FeatureCount = 214; + int32_t InitialSize = 0; + int32_t Blocks = 0; + int32_t Loops = 0; + int32_t Calls = 0; + bool IsLocal = false; + bool IsLinkOnceODR = false; + bool IsLinkOnce = false; + int32_t MaxLoopDepth = 0; + int32_t MaxDomTreeLevel = 0; + std::vector InstructionHistogram; + std::vector InstructionPairHistogram; + void FillTensor(int32_t *Ptr) const; + }; + + IRToNativeSizeLearning() = default; + + static FunctionFeatures getFunctionFeatures(Function &F, + FunctionAnalysisManager &FAM); + +private: + static unsigned getMaxInstructionID(); +}; + +raw_ostream & +operator<<(raw_ostream &Out, + const IRToNativeSizeLearning::FunctionFeatures &Features); + +} // namespace llvm +#endif // LLVM_ML_IRTONATIVESIZELEARNING_H_ Index: llvm/include/llvm/Analysis/ML/InliningAdvisor.h =================================================================== --- /dev/null +++ llvm/include/llvm/Analysis/ML/InliningAdvisor.h @@ -0,0 +1,87 @@ +//===- InlinerML.h - ML infrastructure for inliner --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +#ifndef LLVM_ML_INLINERML_H_ +#define LLVM_ML_INLINERML_H_ + +#include +#include + +#include "llvm/IR/PassManager.h" + +namespace llvm { +class CallBase; +class Function; +class Module; +class PreservedAnalyses; + +struct PendingInliningRecordImpl; +class PendingInliningRecord { +public: + PendingInliningRecord(); + PendingInliningRecord(PendingInliningRecord &&); + PendingInliningRecord &operator=(PendingInliningRecord &&); + + PendingInliningRecord(const PendingInliningRecord &) = delete; + + void recordInlining(bool CalleeWasDeleted, bool SiteWasInlined); + ~PendingInliningRecord(); + operator bool() const { return !!Impl; } + +private: + friend class InliningAdvisorImpl; + PendingInliningRecord(PendingInliningRecordImpl *Impl); + std::unique_ptr Impl; +}; + +class InliningAdvisorImpl; +class InliningAdvisor { +public: + InliningAdvisor(Module &M, ModuleAnalysisManager &MAM); + InliningAdvisor(InliningAdvisor &&); + + ~InliningAdvisor(); + + PendingInliningRecord shouldInline(CallBase *CB, + bool &AlternativeRecommendation, + bool Mandatory, int CostEstimate); + + bool invalidate(Module &, const PreservedAnalyses &, + ModuleAnalysisManager::Invalidator &) { + // InliningAdvisor must be preserved across analysis invalidations. 
+ return false; + } + + void OnPassEntry(); + void OnPassExit(); + void OnSuccessfulInlining(const Function *F); + + void OnAllInliningCompleted(); + void OnFunctionDeleted(Function *F); + +private: + std::unique_ptr Impl; +}; + +class InliningAdvisorAnalysis + : public AnalysisInfoMixin { +public: + using Result = InliningAdvisor; + Result run(Module &M, ModuleAnalysisManager &MAM) { return Result(M, MAM); } + static AnalysisKey Key; +}; + +class InliningAdvisorCleanupPass + : public PassInfoMixin { +public: + InliningAdvisorCleanupPass() = default; + PreservedAnalyses run(Module &, ModuleAnalysisManager &); +}; + +} // namespace llvm +#endif // LLVM_ML_INLINERML_H_ Index: llvm/include/llvm/Passes/PassBuilder.h =================================================================== --- llvm/include/llvm/Passes/PassBuilder.h +++ llvm/include/llvm/Passes/PassBuilder.h @@ -344,6 +344,12 @@ ThinLTOPhase Phase, bool DebugLogging = false); + /// Construct the module pipeline that performs inlining as well as + /// the inlining-driven cleanups. + ModulePassManager buildInlinerPipeline(OptimizationLevel Level, + ThinLTOPhase Phase, + bool DebugLogging = false); + /// Construct the core LLVM module optimization pipeline. /// /// This pipeline focuses on optimizing the execution speed of the IR. It Index: llvm/include/llvm/Transforms/IPO/Inliner.h =================================================================== --- llvm/include/llvm/Transforms/IPO/Inliner.h +++ llvm/include/llvm/Transforms/IPO/Inliner.h @@ -94,12 +94,14 @@ /// passes be composed to achieve the same end result. 
class InlinerPass : public PassInfoMixin { public: - InlinerPass(InlineParams Params = getInlineParams()) - : Params(std::move(Params)) {} + InlinerPass(InlineParams Params = getInlineParams(), + bool MandatoryOnly = false) + : Params(std::move(Params)), MandatoryOnly(MandatoryOnly) {} ~InlinerPass(); InlinerPass(InlinerPass &&Arg) : Params(std::move(Arg.Params)), - ImportedFunctionsStats(std::move(Arg.ImportedFunctionsStats)) {} + ImportedFunctionsStats(std::move(Arg.ImportedFunctionsStats)), + MandatoryOnly(Arg.MandatoryOnly) {} PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR); @@ -107,6 +109,7 @@ private: InlineParams Params; std::unique_ptr ImportedFunctionsStats; + const bool MandatoryOnly; }; } // end namespace llvm Index: llvm/lib/Analysis/CMakeLists.txt =================================================================== --- llvm/lib/Analysis/CMakeLists.txt +++ llvm/lib/Analysis/CMakeLists.txt @@ -1,3 +1,8 @@ +if (NOT LLVM_USE_ML_POLICY STREQUAL "") + add_subdirectory(ML) + set(EXTRADEPS LLVMMLPolicies) +endif() + add_llvm_component_library(LLVMAnalysis AliasAnalysis.cpp AliasAnalysisEvaluator.cpp @@ -39,6 +44,7 @@ IVUsers.cpp IndirectCallPromotionAnalysis.cpp InlineCost.cpp + InliningAdvisor.cpp InstCount.cpp InstructionPrecedenceTracking.cpp InstructionSimplify.cpp @@ -105,4 +111,8 @@ DEPENDS intrinsics_gen + ${EXTRADEPS} + + LINK_LIBS + ${EXTRADEPS} ) Index: llvm/lib/Analysis/InlineCost.cpp =================================================================== --- llvm/lib/Analysis/InlineCost.cpp +++ llvm/lib/Analysis/InlineCost.cpp @@ -401,7 +401,10 @@ /// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note /// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer -class InlineCostCallAnalyzer final : public CallAnalyzer { +class InlineCostCallAnalyzer : public CallAnalyzer { + /// FIXME(mtrofin): remove the 'protected' here, only needed for + /// 
CompleteInlineCostAnalyzer +protected: const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1; const bool ComputeFullInlineCost; int LoadEliminationCost = 0; @@ -704,6 +707,64 @@ int getThreshold() { return Threshold; } int getCost() { return Cost; } }; + +/// FIXME(mtrofin): point in time solution for evaluating the aggregate cost +/// of a call site, and distinguishing between 'too expensive' and 'invalid'. +/// We delegate to InlineCostAnalyzer, which doesn't worry about cost, if +/// InlineParams have ComputeFullInlineCost == true, until finalizeAnalysis. +/// The long term solution we want here is to avoid cost calculation altogether, +/// and, instead, collect the individual features (i.e. DCEd instructions, nr +/// loops, indirect calls - the basic blocks on which cost is calculated) +class CompleteInlineCostAnalyzer final : public InlineCostCallAnalyzer { +public: + CompleteInlineCostAnalyzer( + const TargetTransformInfo &TTI, + std::function &GetAssumptionCache, + Optional> &GetBFI, + ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee, + CallBase &Call, + const InlineParams Params = {/* DefaultThreshold*/ 0, + /*HintThreshold*/ {}, + /*ColdThreshold*/ {}, + /*OptSizeThreshold*/ {}, + /*OptMinSizeThreshold*/ {}, + /*HotCallSiteThreshold*/ {}, + /*LocallyHotCallSiteThreshold*/ {}, + /*ColdCallSiteThreshold*/ {}, + /* ComputeFullInlineCost*/ true}, + bool BoostIndirect = true) + : InlineCostCallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, + Callee, Call, Params, BoostIndirect) {} + + // Ignore parent's result, and just return success. + InlineResult finalizeAnalysis() override { + InlineCostCallAnalyzer::finalizeAnalysis(); + return InlineResult::success(); + } + + // Same as parent's onLoweredCall, just that it uses itself internally. Not + // worth doing something more elegant, because this implementation will go + // away. 
+ void onLoweredCall(Function *F, CallBase &Call, + bool IsIndirectCall) override { + addCost(Call.arg_size() * InlineConstants::InstrCost); + + if (IsIndirectCall && BoostIndirectCalls) { + auto IndirectCallParams = Params; + IndirectCallParams.DefaultThreshold = + InlineConstants::IndirectCallThreshold; + CompleteInlineCostAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, + *F, Call, IndirectCallParams, false); + if (CA.analyze().isSuccess()) { + // We were able to inline the indirect call! Subtract the cost from the + // threshold to get the bonus we want to apply, but don't go below zero. + Cost -= std::max(0, CA.getThreshold() - CA.getCost()); + } + } else + // Otherwise simply add the cost for merely making the call. + addCost(InlineConstants::CallPenalty); + } +}; } // namespace /// Test whether the given value is an Alloca-derived function argument. @@ -2210,14 +2271,23 @@ GetAssumptionCache, GetBFI, GetTLI, PSI, ORE); } -InlineCost llvm::getInlineCost( - CallBase &Call, Function *Callee, const InlineParams &Params, - TargetTransformInfo &CalleeTTI, +Optional llvm::getInliningCostEstimate( + CallBase &Call, TargetTransformInfo &CalleeTTI, std::function &GetAssumptionCache, Optional> GetBFI, - function_ref GetTLI, ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { + CompleteInlineCostAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, + *Call.getCalledFunction(), Call); + auto R = CA.analyze(); + if (!R.isSuccess()) + return {}; + return CA.getCost(); +} +Optional llvm::getTrivialInliningDecision( + CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI, + std::function &GetAssumptionCache, + function_ref GetTLI) { // Cannot inline indirect calls. 
if (!Callee) return llvm::InlineCost::getNever("indirect call"); @@ -2272,8 +2342,25 @@ if (Call.isNoInline()) return llvm::InlineCost::getNever("noinline call site attribute"); + return {}; +} + +InlineCost llvm::getInlineCost( + CallBase &Call, Function *Callee, const InlineParams &Params, + TargetTransformInfo &CalleeTTI, + std::function &GetAssumptionCache, + Optional> GetBFI, + function_ref GetTLI, + ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { + auto TrivialDecision = getTrivialInliningDecision(Call, Callee, CalleeTTI, + GetAssumptionCache, GetTLI); + + if (TrivialDecision.hasValue()) + return TrivialDecision.getValue(); + LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() - << "... (caller:" << Caller->getName() << ")\n"); + << "... (caller:" << Call.getCaller()->getName() + << ")\n"); InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, Call, Params); Index: llvm/lib/Analysis/InliningAdvisor.cpp =================================================================== --- /dev/null +++ llvm/lib/Analysis/InliningAdvisor.cpp @@ -0,0 +1,63 @@ +//===- InlinerMLDefault.cpp - Common implementation -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements InlinerML APIs for the case we do not want ML heuristics +// The goal is to keep InlinerML.cpp simple, without conditional compilation, +// and oportunisitically factor out some common APIs. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/ML/InliningAdvisor.h" + +namespace llvm { + +AnalysisKey InliningAdvisorAnalysis::Key; + +PreservedAnalyses InliningAdvisorCleanupPass::run(Module &M, + ModuleAnalysisManager &MAM) { + InliningAdvisor *Advisor = MAM.getCachedResult(M); + if (Advisor) + Advisor->OnAllInliningCompleted(); + return PreservedAnalyses::all(); +} + +#if !LLVM_USE_ML_POLICY_DEV && !LLVM_USE_ML_POLICY_REL + +class InliningAdvisorImpl {}; +struct PendingInliningRecordImpl {}; + +void PendingInliningRecord::recordInlining(bool, bool) {} + +InliningAdvisor::InliningAdvisor(Module &, ModuleAnalysisManager &) + : Impl(nullptr) {} + +PendingInliningRecord::PendingInliningRecord(PendingInliningRecordImpl *) + : PendingInliningRecord() {} + +PendingInliningRecord InliningAdvisor::shouldInline(CallBase *, bool &, bool, + int) { + return PendingInliningRecord(); +} +void InliningAdvisor::OnPassEntry() {} +void InliningAdvisor::OnPassExit() {} +void InliningAdvisor::OnSuccessfulInlining(const Function *) {} +void InliningAdvisor::OnAllInliningCompleted() {} +void InliningAdvisor::OnFunctionDeleted(Function *) {} + +PendingInliningRecord::~PendingInliningRecord() = default; +PendingInliningRecord::PendingInliningRecord() = default; +PendingInliningRecord::PendingInliningRecord(PendingInliningRecord &&) = + default; +PendingInliningRecord & +PendingInliningRecord::operator=(PendingInliningRecord &&) = default; + +InliningAdvisor::~InliningAdvisor() = default; +InliningAdvisor::InliningAdvisor(InliningAdvisor &&) = default; +#endif +} // namespace llvm Index: llvm/lib/Analysis/ML/CMakeLists.txt =================================================================== --- /dev/null +++ llvm/lib/Analysis/ML/CMakeLists.txt @@ -0,0 +1,31 @@ +set(SRCS InliningAdvisor.cpp) + +if (LLVM_USE_ML_POLICY STREQUAL "DEV") + set(SRCS ${SRCS} + IRToNativeSizeLearning.cpp + TFUtils.cpp + ) + set(EXTRA_DEPS 
${tensorflow_c_api}) +endif () + +if (LLVM_USE_ML_POLICY STREQUAL "REL") + include(TensorFlowCompile) + tfcompile(models/inliner serve action InlinerSizeModel llvm::InlinerSizeModel) + set(SRCS ${SRCS} ${TF_AOT_OBJ}) + set(EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) + set(EXTRA_DEPS LLVMtf_aot_runtime) + set(LLVM_OPTIONAL_SOURCES IRToNativeSizeLearning.cpp TFUtils.cpp) +endif () + +add_llvm_library(LLVMMLPolicies STATIC + ${SRCS} + + ADDITIONAL_HEADER_DIRS + ${EXTRA_INCLUDES} + + DEPS + ${EXTRA_DEPS} + + LINK_LIBS + ${EXTRA_DEPS} + ) Index: llvm/lib/Analysis/ML/IRToNativeSizeLearning.cpp =================================================================== --- /dev/null +++ llvm/lib/Analysis/ML/IRToNativeSizeLearning.cpp @@ -0,0 +1,174 @@ +//===- IRToNativeSizeLearning.cpp - Infra to learn IR to native size model-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This implements feature and label extraction for offline supervised learning +// of a IR to native size model. 
+// +//===----------------------------------------------------------------------===// +#include "llvm/Analysis/ML/IRToNativeSizeLearning.h" + +#include +#include + +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/PassManager.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +namespace { +size_t getSize(Function &F, TargetTransformInfo &TTI) { + size_t Ret = 0; + for (auto &BB : F) + for (auto &I : BB) + Ret += TTI.getInstructionCost( + &I, TargetTransformInfo::TargetCostKind::TCK_CodeSize); + return Ret; +} + +size_t getSize(Function &F, FunctionAnalysisManager &FAM) { + auto &TTI = FAM.getResult(F); + return getSize(F, TTI); +} + +int getStableID(const Instruction &I) { + switch (I.getOpcode()) { +#define HANDLE_INST(ID, ENUM_NAME, IGNORE) \ + case Instruction::ENUM_NAME: \ + return ID; +#include "llvm/IR/Instruction.def" + default: + return -1; + } +} + +unsigned getMaxDominatorTreeDepth(const Function &F, + const DominatorTree &Tree) { + unsigned Ret = 0; + for (auto &BB : F) + if (auto *TN = Tree.getNode(&BB)) + Ret = std::max(Ret, TN->getLevel()); + return Ret; +} +} // namespace + +llvm::raw_ostream & +llvm::operator<<(llvm::raw_ostream &Out, + const IRToNativeSizeLearning::FunctionFeatures &Features) { + std::vector T(Features.FeatureCount); + Features.FillTensor(T.data()); + for (auto &V : T) + Out << V << ","; + return Out; +} + +unsigned IRToNativeSizeLearning::getMaxInstructionID() { +#define LAST_OTHER_INST(NR) return NR; +#include "llvm/IR/Instruction.def" +} + +IRToNativeSizeLearning::FunctionFeatures +IRToNativeSizeLearning::getFunctionFeatures(Function &F, + FunctionAnalysisManager &FAM) { + static std::vector> InterestingSuccessions( + {{1, 34}, {15, 27}, {53, 53}, {53, 
34}, {1, 11}, {32, 2}, {2, 48}, + {28, 48}, {1, 45}, {49, 32}, {57, 56}, {55, 53}, {1, 28}, {57, 34}, + {1, 1}, {32, 28}, {32, 15}, {49, 28}, {53, 1}, {2, 53}, {48, 34}, + {28, 53}, {2, 32}, {1, 40}, {32, 48}, {29, 56}, {56, 32}, {55, 56}, + {48, 56}, {1, 31}, {33, 34}, {2, 28}, {1, 12}, {55, 1}, {31, 31}, + {65, 1}, {33, 56}, {32, 32}, {13, 13}, {1, 26}, {13, 26}, {2, 1}, + {1, 33}, {47, 49}, {64, 1}, {2, 38}, {34, 53}, {48, 2}, {55, 34}, + {34, 32}, {1, 5}, {56, 13}, {2, 2}, {2, 49}, {33, 2}, {49, 39}, + {56, 49}, {33, 49}, {32, 39}, {39, 57}, {29, 33}, {31, 34}, {32, 29}, + {47, 15}, {13, 34}, {2, 33}, {32, 49}, {49, 34}, {56, 33}, {1, 30}, + {33, 33}, {31, 33}, {2, 29}, {56, 7}, {32, 13}, {2, 55}, {56, 56}, + {2, 34}, {1, 42}, {34, 49}, {1, 20}, {32, 33}, {1, 25}, {53, 28}, + {1, 14}, {31, 49}, {28, 2}, {2, 13}, {2, 56}, {1, 32}, {56, 53}, + {65, 65}, {33, 53}, {64, 64}, {13, 2}, {34, 33}, {1, 4}, {49, 2}, + {1, 9}, {56, 1}, {33, 1}, {53, 57}, {32, 53}, {13, 56}, {32, 56}, + {55, 55}, {1, 18}, {49, 56}, {34, 34}, {1, 7}, {56, 64}, {32, 1}, + {13, 33}, {55, 28}, {49, 33}, {57, 57}, {56, 34}, {34, 56}, {33, 32}, + {32, 40}, {1, 29}, {53, 2}, {34, 1}, {32, 34}, {49, 49}, {1, 24}, + {40, 34}, {1, 13}, {38, 34}, {29, 2}, {34, 2}, {1, 39}, {1, 22}, + {1, 27}, {49, 1}, {1, 8}, {56, 2}}); + // We pay for this the first time, then it's sorted and we pay a traversal. 
+ std::sort(InterestingSuccessions.begin(), InterestingSuccessions.end()); + + auto &DomTree = FAM.getResult(F); + FunctionFeatures FF; + size_t InstrCount = getMaxInstructionID() + 1; + FF.InstructionHistogram.resize(InstrCount); + + FF.InstructionPairHistogram.resize(InterestingSuccessions.size()); + + auto StartID = 0; + auto LastID = StartID; + auto getPairIndex = [](size_t a, size_t b) { + auto I = std::find(InterestingSuccessions.begin(), + InterestingSuccessions.end(), std::make_pair(a, b)); + if (I == InterestingSuccessions.end()) + return -1; + return static_cast(std::distance(InterestingSuccessions.begin(), I)); + }; + + // We don't want debug calls, because they'd just add noise. + for (auto &BB : F) { + for (auto I = BB.instructionsWithoutDebug().begin(), + E = BB.instructionsWithoutDebug().end(); + I != E; ++I) { + auto ID = getStableID(*I); + + ++FF.InstructionHistogram[ID]; + int PairIndex = getPairIndex(LastID, ID); + if (PairIndex >= 0) + ++FF.InstructionPairHistogram[PairIndex]; + LastID = ID; + if (isa(*I)) + ++FF.Calls; + } + } + + FF.InitialSize = getSize(F, FAM); + FF.IsLocal = F.hasLocalLinkage(); + FF.IsLinkOnceODR = F.hasLinkOnceODRLinkage(); + FF.IsLinkOnce = F.hasLinkOnceLinkage(); + FF.Blocks = + std::distance(F.getBasicBlockList().begin(), F.getBasicBlockList().end()); + auto &LI = FAM.getResult(F); + FF.Loops = std::distance(LI.begin(), LI.end()); + for (auto &L : LI) + FF.MaxLoopDepth = + std::max(FF.MaxLoopDepth, static_cast(L->getLoopDepth())); + FF.MaxDomTreeLevel = getMaxDominatorTreeDepth(F, DomTree); + return FF; +} + +void IRToNativeSizeLearning::FunctionFeatures::FillTensor(int32_t *Ptr) const { + int Pos = 0; + Ptr[Pos++] = InitialSize; + Ptr[Pos++] = Blocks; + Ptr[Pos++] = Calls; + Ptr[Pos++] = IsLocal; + Ptr[Pos++] = IsLinkOnceODR; + Ptr[Pos++] = IsLinkOnce; + Ptr[Pos++] = Loops; + Ptr[Pos++] = MaxLoopDepth; + Ptr[Pos++] = MaxDomTreeLevel; + for (auto Count : InstructionHistogram) + Ptr[Pos++] = Count; + + for (auto Count : 
InstructionPairHistogram) + Ptr[Pos++] = Count; +} Index: llvm/lib/Analysis/ML/InliningAdvisor.cpp =================================================================== --- /dev/null +++ llvm/lib/Analysis/ML/InliningAdvisor.cpp @@ -0,0 +1,548 @@ +//===- InlinerML.cpp - machine learned inlining heuristics ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the interface between the inliner and a learned model. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "llvm/Analysis/ML/InliningAdvisor.h" + +#include "llvm/ADT/SCCIterator.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Path.h" + +#ifdef LLVM_USE_ML_POLICY_DEV +#include "InliningModelRunnerTraining.h" +#elif defined LLVM_USE_ML_POLICY_REL +#include "InliningModelRunnerProduction.h" +#else +#error Unsupported ML Execution Model +#endif + +using namespace llvm; + +static cl::opt + DebugTrainingLog("debug-training-log", cl::Hidden, + cl::desc("Output a debug training log.")); + +static cl::opt SizeIncreaseThreshold( + "ml-advisor-size-increase-threshold", cl::Hidden, + cl::desc("Maximum factor by which expected native size may increase before " + "blocking any further inlining."), + cl::init(2.0)); + +namespace llvm { + +struct CallSiteInfo { + CallSiteInfo(CallBase *CB, unsigned H) : Call(CB), Height(H) {} + CallBase *const Call; + const unsigned Height; +}; + +struct FuncDesc { + // Optionally store the name for debugging. 
To avoid tying this to any + // Function lifetime considerations, we store it as a std::string + std::string Name; + int BasicBlockCount = 0; + int ConditionallyExecutedBlocks = 0; + int Users = 0; +}; + +static int getNrOfUsers(const Function &F) { + return ((!F.hasLocalLinkage()) ? 1 : 0) + + std::distance(F.user_begin(), F.user_end()); +} + +class InliningAdvisorImpl { +public: + InliningAdvisorImpl(Module &M, ModuleAnalysisManager &MAM); + InliningAdvisorImpl(InliningAdvisorImpl &&) = default; + + ~InliningAdvisorImpl(); + CallGraph *callGraph() const { return CG.get(); } + + PendingInliningRecord shouldInline(CallBase *CB, + bool &AlternativeRecommendation, + bool Mandatory, int CostEstimate) { + return shouldInline(CallSiteInfo(CB, FunctionLevels[CB->getCaller()]), + AlternativeRecommendation, Mandatory, CostEstimate); + } + + PendingInliningRecord shouldInline(const CallSiteInfo &CSI, + bool &AlternativeRecommendation, + bool Mandatory, int CostEstimate); + bool invalidate(Module &, const PreservedAnalyses &, + llvm::ModuleAnalysisManager::Invalidator &) { + // InliningAdvisorImpl must be preserved across analysis invalidations. 
+ return false; + } + bool forceStop() const { return ForceStop; } + size_t getTotalSizeEstimate() const; + + void OnPassEntry() { + for (auto *F : LastInlinedFunctions) { + FuncDescs.reset(F); + NativeSizeEstimates.reset(F); + NrOfCalls.reset(F); + } + LastInlinedFunctions.clear(); + } + + void OnPassExit() { + for (auto *F : LastInlinedFunctions) { + if (DeletedFunctions.find(F) != DeletedFunctions.end()) + continue; + } + } + + void OnSuccessfulInlining(const Function *F) { + LastInlinedFunctions.insert(F); + + FuncDescs.reset(F); + NativeSizeEstimates.reset(F); + NrOfCalls.reset(F); + } + + void OnAllInliningCompleted() { + for (auto *F : DeletedFunctions) + delete (F); + } + + void OnFunctionDeleted(Function *F) { + assert(DeletedFunctions.count(F) == 0 && + "Can only delete a function once."); + DeletedFunctions.insert(F); + } + +private: + friend class InliningAdvisor; + friend PendingInliningRecordImpl; + + template class Memoized { + mutable std::unordered_map Memo; + + public: + Memoized() = default; + Memoized(const Memoized &) = delete; + Memoized(Memoized &&) = default; + void reset(K Key) { Memo.erase(Key); } + // We can't have F as part of the state, because, if it captures state - + // likely the Advisor's - moving or copying the Advisor would result in an + // invalid capture. We could maybe bind the parameters, but this alternative + // is easier for now, since we only need to call get in one place. 
+ V getOrInsert(K Key, std::function F) { + V Default; + auto I = Memo.insert(std::make_pair(Key, Default)); + if (I.second) + I.first->second = F(Key); + return I.first->second; + } + }; + + bool isLogging() const { + return !DebugTrainingLog.empty() || ModelRunner->isLogging(); + } + + unsigned getLocalCalls(const Function &F) const; + size_t getSizeEstimate(const Function &F) const; + size_t getIRSize(const Function &F) const { return F.getInstructionCount(); } + FuncDesc getFuncDesc(const Function &F) const; + size_t getModuleSize() const; + + int64_t NodeCount = 0; + int64_t EdgeCount = 0; + struct InliningRecordForDebugging { + InliningRecordForDebugging(const std::string &CallerName, + const std::string &CalleeName, + const InliningFeatures &Features, + bool SiteWasInlined, int NativeDeltaSize, + bool Mandatory) + : CallerName(CallerName), CalleeName(CalleeName), Features(Features), + SiteWasInlined(SiteWasInlined), NativeDeltaSize(NativeDeltaSize), + Mandatory(Mandatory) {} + const std::string CallerName; + const std::string CalleeName; + const InliningFeatures Features; + const bool SiteWasInlined; + const int NativeDeltaSize; + const bool Mandatory; + }; + std::vector InliningRecords; + std::unordered_set DeletedFunctions; + int32_t InitialNativeSize = 0; + int32_t CurrentNativeSize = 0; + int32_t InitialIRSize = 0; + int32_t CurrentIRSize = 0; + std::unique_ptr CG; + bool ForceStop = false; + FunctionAnalysisManager &FAM; + // Using a unique_ptr to avoid complicated move semantics implementation in + // InliningModelRunner. 
+ std::unique_ptr ModelRunner; + std::unique_ptr SizeEstimator; + + std::map FunctionLevels; + mutable Memoized FuncDescs; + mutable Memoized NativeSizeEstimates; + mutable Memoized NrOfCalls; + std::unordered_set LastInlinedFunctions; +}; + +struct PendingInliningRecordImpl { + PendingInliningRecordImpl(InliningAdvisorImpl *Advisor, + const Function *Caller, const Function *Callee, + bool Mandatory); + void recordInlining(bool CalleeWasDeleted, bool SiteWasInlined); + ~PendingInliningRecordImpl() { + assert(Recorded && "Unrecorded PendingInliningRecord"); + } + InliningAdvisorImpl *const Advisor; + const Function *const Caller; + const Function *const Callee; + const size_t EstimatedNativeSizeCallerBefore; + const size_t EstimatedNativeSizeCalleeBefore; + const bool Mandatory; + const int CallerIRSize; + const int CalleeIRSize; + const unsigned CallerAndCalleeEdges; + +private: + bool Recorded = false; +}; + +PendingInliningRecord::PendingInliningRecord(PendingInliningRecordImpl *Record) + : Impl(Record) {} + +void PendingInliningRecord::recordInlining(bool CalleeWasDeleted, + bool SiteWasInlined) { + Impl->recordInlining(CalleeWasDeleted, SiteWasInlined); +} + +PendingInliningRecordImpl::PendingInliningRecordImpl( + InliningAdvisorImpl *Advisor, const Function *Caller, + const Function *Callee, bool Mandatory) + : Advisor(Advisor), Caller(Caller), Callee(Callee), + EstimatedNativeSizeCallerBefore( + Advisor->isLogging() ? Advisor->getSizeEstimate(*Caller) : 0), + EstimatedNativeSizeCalleeBefore( + Advisor->isLogging() ? 
Advisor->getSizeEstimate(*Callee) : 0), + Mandatory(Mandatory), CallerIRSize(Advisor->getIRSize(*Caller)), + CalleeIRSize(Advisor->getIRSize(*Callee)), + CallerAndCalleeEdges(Advisor->getLocalCalls(*Caller) + + Advisor->getLocalCalls(*Callee)) {} + +void PendingInliningRecordImpl::recordInlining(bool CalleeWasDeleted, + bool SiteWasInlined) { + Recorded = true; + assert(!CalleeWasDeleted || SiteWasInlined); + if (SiteWasInlined) + Advisor->OnSuccessfulInlining(Caller); + + int IRSizeAfter = + Advisor->getIRSize(*Caller) + (CalleeWasDeleted ? 0 : CalleeIRSize); + Advisor->CurrentIRSize += IRSizeAfter - (CallerIRSize + CalleeIRSize); + if (Advisor->CurrentIRSize > SizeIncreaseThreshold * Advisor->InitialIRSize) + Advisor->ForceStop = true; + + unsigned NewCallerAndCalleeEdges = Advisor->getLocalCalls(*Caller); + if (CalleeWasDeleted) + --Advisor->NodeCount; + else + NewCallerAndCalleeEdges += Advisor->getLocalCalls(*Callee); + Advisor->EdgeCount -= CallerAndCalleeEdges; + Advisor->EdgeCount += NewCallerAndCalleeEdges; + assert(Advisor->CurrentIRSize >= 0 && Advisor->EdgeCount >= 0 && + Advisor->NodeCount >= 0); + + // If we don't train or produce a debug log, we don't want to compute native + // size. All adjustments are recorded, so we can return. + if (!Advisor->isLogging()) + return; + + int NativeDeltaSize = std::numeric_limits::max(); + if (!Advisor->ForceStop) { + int NativeSizeAfter = + Advisor->getSizeEstimate(*Caller) + + (CalleeWasDeleted ? 
0 : EstimatedNativeSizeCalleeBefore); + NativeDeltaSize = NativeSizeAfter - (EstimatedNativeSizeCalleeBefore + + EstimatedNativeSizeCallerBefore); + } + if (!Mandatory) { + Advisor->ModelRunner->receiveReward(NativeDeltaSize); + } + + if (!DebugTrainingLog.empty()) { + InliningFeatures FeaturesCopy(FeatureList::NumberOfFeatures); + for (size_t I = 0; I < FeaturesCopy.size(); ++I) + FeaturesCopy[I] = Advisor->ModelRunner->get_feature(I); + Advisor->InliningRecords.emplace_back( + Caller->getName().str(), Callee->getName().str(), FeaturesCopy, + SiteWasInlined, NativeDeltaSize, Mandatory); + } + + // Don't adjust CurrentNativeSize with numeric_limits::max() + if (Advisor->ForceStop) + return; + + Advisor->CurrentNativeSize += NativeDeltaSize; + assert(Advisor->CurrentNativeSize >= 0); +} + +CallBase *getInlinableCS(Instruction &I) { + if (auto *CS = dyn_cast(&I)) + if (Function *Callee = CS->getCalledFunction()) { + if (!Callee->isDeclaration()) { + return CS; + } + } + return nullptr; +} + +InliningAdvisorImpl::InliningAdvisorImpl(Module &M, ModuleAnalysisManager &MAM) + : CG(new CallGraph(M)), + FAM(MAM.getResult(M).getManager()), + ModelRunner(std::make_unique(M.getContext())) { + for (const Function &F : CG->getModule()) { + if (F.isDeclaration()) + continue; + ++NodeCount; + EdgeCount += getLocalCalls(F); + } + + if (isLogging()) { + SizeEstimator = + std::make_unique(M.getContext(), FAM); + if (!SizeEstimator || !SizeEstimator->isValid()) { + SizeEstimator.reset(); + M.getContext().emitError("Could not initialize Size Estimator"); + return; + } + InitialNativeSize = getTotalSizeEstimate(); + CurrentNativeSize = InitialNativeSize; + } + InitialIRSize = getModuleSize(); + CurrentIRSize = InitialIRSize; + + for (auto I = scc_begin(CG.get()); !I.isAtEnd(); ++I) { + const std::vector &CGNodes = *I; + unsigned Level = 0; + for (auto CGNode : CGNodes) { + Function *F = CGNode->getFunction(); + if (F && !F->isDeclaration()) { + for (auto &I : instructions(F)) { + if 
(auto CS = getInlinableCS(I)) { + auto *Called = CS->getCalledFunction(); + auto Pos = FunctionLevels.find(Called); + // In bottom up traversal, an inlinable call is either in the + // same SCC, or to a function in a visited SCC. So not finding its + // level means we haven't visited it yet, meaning it's in this SCC. + if (Pos == FunctionLevels.end()) + continue; + Level = std::max(Level, Pos->second + 1); + } + } + } + } + for (auto CGNode : CGNodes) { + Function *F = CGNode->getFunction(); + if (F && !F->isDeclaration()) + FunctionLevels[F] = Level; + } + } +} + +size_t InliningAdvisorImpl::getModuleSize() const { + size_t Ret = 0; + for (auto &F : CG->getModule()) + if (!F.isDeclaration()) + Ret += getIRSize(F); + return Ret; +} + +InliningAdvisorImpl::~InliningAdvisorImpl() { + if (DebugTrainingLog.empty() || callGraph() == nullptr) + return; + + std::error_code ErrorCode; + raw_fd_ostream OutFile(DebugTrainingLog, ErrorCode); + + OutFile << "ModuleName: " << callGraph()->getModule().getName() << "\n" + << "InitialSize: " << InitialNativeSize << "\n"; + int DecisionID = 0; + for (const auto &Decision : InliningRecords) { + OutFile << "Inlining Decision: " << DecisionID++ << "\n"; + OutFile << "\t" + << "DeltaSize: " << Decision.NativeDeltaSize << "\n"; + OutFile << "\t" + << "Success: " << Decision.SiteWasInlined << "\n"; + OutFile << "\tCallerName: " << Decision.CallerName << "\n"; + OutFile << "\tCalleeName: " << Decision.CalleeName << "\n"; + OutFile << "\tFeatures: \n"; + + for (size_t I = 0; I < Decision.Features.size(); ++I) { + OutFile << "\t\t" << FeatureNameMap[I] << ": " << Decision.Features[I] + << "\n"; + } + } + OutFile << "FinalNodeCount: " << NodeCount << "\n"; + OutFile << "FinalEdgeCount: " << EdgeCount << "\n"; + OutFile << "FinalSize: " << getTotalSizeEstimate() << "\n"; +} + +PendingInliningRecord +InliningAdvisorImpl::shouldInline(const CallSiteInfo &CSI, + bool &AlternativeRecommendation, + bool Mandatory, int CostEstimate) { + if 
(ForceStop) { + AlternativeRecommendation = Mandatory; + return PendingInliningRecord(); + } + + auto &CS = *CSI.Call; + auto &Callee = *CS.getCalledFunction(); + auto &Caller = *CS.getCaller(); + + if (!Mandatory || !DebugTrainingLog.empty()) { + auto NrCtantParams = 0; + for (auto I = CS.arg_begin(), E = CS.arg_end(); I != E; ++I) { + NrCtantParams += (isa(*I)); + } + auto CallerBefore = getFuncDesc(Caller); + auto CalleeBefore = getFuncDesc(Callee); + + ModelRunner->set_feature(FeatureList::CalleeBasicBlockCount, + CalleeBefore.BasicBlockCount); + ModelRunner->set_feature(FeatureList::CallSiteHeight, CSI.Height); + ModelRunner->set_feature(FeatureList::NodeCount, NodeCount); + ModelRunner->set_feature(FeatureList::NrCtantParams, NrCtantParams); + ModelRunner->set_feature(FeatureList::CostEstimate, CostEstimate); + ModelRunner->set_feature(FeatureList::EdgeCount, EdgeCount); + ModelRunner->set_feature(FeatureList::CallerUsers, CallerBefore.Users); + ModelRunner->set_feature(FeatureList::CallerConditionallyExecutedBlocks, + CallerBefore.ConditionallyExecutedBlocks); + ModelRunner->set_feature(FeatureList::CallerBasicBlockCount, + CallerBefore.BasicBlockCount); + ModelRunner->set_feature(FeatureList::CalleeConditionallyExecutedBlocks, + CalleeBefore.ConditionallyExecutedBlocks); + ModelRunner->set_feature(FeatureList::CalleeUsers, CalleeBefore.Users); + } + + PendingInliningRecord Ret( + new PendingInliningRecordImpl(this, &Caller, &Callee, Mandatory)); + + if (Mandatory) + return Ret; + + AlternativeRecommendation = static_cast( + ModelRunner->run(static_cast(AlternativeRecommendation))); + + return Ret; +} + +size_t InliningAdvisorImpl::getSizeEstimate(const Function &F) const { + return NativeSizeEstimates.getOrInsert(&F, [this](const Function *F) { + return SizeEstimator->getSizeEstimate(*F); + }); +} + +unsigned InliningAdvisorImpl::getLocalCalls(const Function &F) const { + return NrOfCalls.getOrInsert(&F, [this](const Function *F) { + unsigned Ret = 0; + for 
(const BasicBlock &BB : *F) { + for (const Instruction &I : BB) { + if (auto CS = dyn_cast<CallBase>(&I)) { + const Function *Callee = CS->getCalledFunction(); + if (Callee && !Callee->isIntrinsic() && !Callee->isDeclaration()) + ++Ret; + } + } + } + return Ret; + }); +} + +FuncDesc InliningAdvisorImpl::getFuncDesc(const Function &F) const { + return FuncDescs.getOrInsert(&F, [this](const Function *F) { + FuncDesc Ret; + if (!DebugTrainingLog.empty()) + Ret.Name = F->getName().str(); + if (DeletedFunctions.count(F) > 0) { + assert(getNrOfUsers(*F) == 0); + return Ret; + } + Ret.Users = getNrOfUsers(*F); + for (const auto &BB : *F) { + ++Ret.BasicBlockCount; + if (const auto *BI = dyn_cast<BranchInst>(BB.getTerminator())) { + if (BI->isConditional()) { + Ret.ConditionallyExecutedBlocks += BI->getNumSuccessors(); + } + } else if (const auto *SI = dyn_cast<SwitchInst>(BB.getTerminator())) { + Ret.ConditionallyExecutedBlocks += + (SI->getNumCases() + (nullptr != SI->getDefaultDest())); + } + } + return Ret; + }); +} + +size_t InliningAdvisorImpl::getTotalSizeEstimate() const { + size_t Ret = 0; + for (auto const &F : CG->getModule()) { + if (F.isDeclaration()) + continue; + if (DeletedFunctions.count(&F) > 0) + continue; + Ret += getSizeEstimate(F); + } + return Ret; +} + +// Implement pImpl for InliningAdvisor and PendingInliningRecord. 
+InliningAdvisor::InliningAdvisor(Module &M, ModuleAnalysisManager &MAM) + : Impl(std::make_unique(M, MAM)) {} + +PendingInliningRecord +InliningAdvisor::shouldInline(CallBase *CB, bool &AlternateRecommendation, + bool Mandatory, int CostEstimate) { + return PendingInliningRecord(Impl->shouldInline( + CallSiteInfo(CB, Impl->FunctionLevels[CB->getCaller()]), + AlternateRecommendation, Mandatory, CostEstimate)); +} +void InliningAdvisor::OnPassEntry() { Impl->OnPassEntry(); } +void InliningAdvisor::OnPassExit() { Impl->OnPassExit(); } +void InliningAdvisor::OnSuccessfulInlining(const Function *F) { + Impl->OnSuccessfulInlining(F); +} +void InliningAdvisor::OnAllInliningCompleted() { + Impl->OnAllInliningCompleted(); +} +void InliningAdvisor::OnFunctionDeleted(Function *F) { + Impl->OnFunctionDeleted(F); +} + +PendingInliningRecord::~PendingInliningRecord() = default; +PendingInliningRecord::PendingInliningRecord() = default; +PendingInliningRecord::PendingInliningRecord(PendingInliningRecord &&) = + default; +PendingInliningRecord & +PendingInliningRecord::operator=(PendingInliningRecord &&) = default; + +InliningAdvisor::~InliningAdvisor() = default; +InliningAdvisor::InliningAdvisor(InliningAdvisor &&) = default; +} // namespace llvm Index: llvm/lib/Analysis/ML/InliningModelFeatureMaps.h =================================================================== --- /dev/null +++ llvm/lib/Analysis/ML/InliningModelFeatureMaps.h @@ -0,0 +1,58 @@ +//===- InliningModelFeatureMaps.h - common model runner defs ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// + +#ifndef LLVM_INLININGMODELFEATUREMAPS_H +#define LLVM_INLININGMODELFEATUREMAPS_H + +#include + +namespace llvm { +enum FeatureList { + CalleeBasicBlockCount = 0, + CallSiteHeight, + NodeCount, + NrCtantParams, + CostEstimate, + EdgeCount, + CallerUsers, + CallerConditionallyExecutedBlocks, + CallerBasicBlockCount, + CalleeConditionallyExecutedBlocks, + CalleeUsers, + + // Last value tracks the total number of values in the enum - it's not an + // actual feature. + NumberOfFeatures +}; + +static const std::array + FeatureNameMap{ + /*FeatureList::CalleeBasicBlockCount*/ "callee_basic_block_count", + /*FeatureList::CallSiteHeight*/ "callsite_height", + /*FeatureList::NodeCount*/ "node_count", + /*FeatureList::NrCtantParams*/ "nr_ctant_params", + /*FeatureList::CostEstimate*/ "cost_estimate", + /*FeatureList::EdgeCount*/ "edge_count", + /*FeatureList::CallerUsers*/ "caller_users", + /*FeatureList::CallerConditionallyExecutedBlocks*/ + "caller_conditionally_executed_blocks", + /*FeatureList::CallerBasicBlockCount*/ "caller_basic_block_count", + /*FeatureList::CalleeConditionallyExecutedBlocks*/ + "callee_conditionally_executed_blocks", + /*FeatureList::CalleeUsers*/ "callee_users", + }; + +static const char *const DecisionName = "inlining_decision"; +static const char *const DefaultDecisionName = "inlining_default"; +static const char *const RewardName = "delta_size"; + +using InliningFeatures = std::vector; + +} // namespace llvm +#endif // LLVM_INLININGMODELFEATUREMAPS_H Index: llvm/lib/Analysis/ML/InliningModelRunnerProduction.h =================================================================== --- /dev/null +++ llvm/lib/Analysis/ML/InliningModelRunnerProduction.h @@ -0,0 +1,122 @@ +//===- InliningModelRunnerProduction.h - production ML runner ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache 
License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// + +#ifndef LLVM_INLININGMODELRUNNERPRODUCTION_H +#define LLVM_INLININGMODELRUNNERPRODUCTION_H + +#include "InlinerSizeModel.h" +#include "InliningModelFeatureMaps.h" + +#include "llvm/IR/LLVMContext.h" + +#include +#include + +namespace llvm { + +static const char *const FeedPrefix = "feed_"; +static const char *const FetchPrefix = "fetch_"; + +/// InliningModelRunner - production mode implementation. It uses a AOT-compiled +/// SavedModel for efficient execution. +class InliningModelRunner { +public: + InliningModelRunner(LLVMContext &Ctx); + + ~InliningModelRunner(); + + int64_t run(int64_t DefaultDecision); + + void receiveReward(int64_t Reward); + + // Disallows copy and assign. + InliningModelRunner(const InliningModelRunner &) = delete; + InliningModelRunner &operator=(const InliningModelRunner &) = delete; + + void set_feature(int Index, int64_t Value); + int64_t get_feature(int Index) const; + bool isLogging() const { return false; } + +private: + bool isUsingInference() const; + + LLVMContext &Ctx; + + std::vector FeatureIndices; + int32_t ResultIndex = -1; + std::unique_ptr CompiledModel; +}; + +class FunctionSizeEstimator { +public: + FunctionSizeEstimator(LLVMContext &Ctx, FunctionAnalysisManager &FAM) {} + // Just return the number of blocks. This is interesting for debugging only. + size_t getSizeEstimate(const Function &F) { return F.size(); } + bool isValid() const { return true; } +}; + +InliningModelRunner::InliningModelRunner(LLVMContext &Ctx) + : Ctx(Ctx), CompiledModel(std::make_unique()) { + if (!isUsingInference()) + return; + + // TODO(yundi): CompiledModel->num_args() becomes 11 + 6 = 17 instead of + // 11 after adding loadable weights functionality in AOT, disable this check + // for now. 
+ // assert(CompiledModel->num_args() == FeatureList::NumberOfFeatures && + // "Features in inlining model does not match FeatureNameMap"); + FeatureIndices.reserve(FeatureList::NumberOfFeatures); + + for (int I = 0; I < FeatureList::NumberOfFeatures; ++I) { + const int Index = + CompiledModel->LookupArgIndex(FeedPrefix + FeatureNameMap[I]); + if (Index < 0) { + Ctx.emitError("Cannot find Feature in inlining model"); + CompiledModel.reset(); + return; + } + FeatureIndices[I] = Index; + } + + ResultIndex = + CompiledModel->LookupResultIndex(std::string(FetchPrefix) + DecisionName); + if (ResultIndex < 0) { + Ctx.emitError("Cannot find DecisionName in inlining model"); + CompiledModel.reset(); + } +} + +bool InliningModelRunner::isUsingInference() const { return !!CompiledModel; } + +int64_t InliningModelRunner::get_feature(int Index) const { + return *static_cast<int64_t *>( + CompiledModel->arg_data(FeatureIndices[Index])); +} + +void InliningModelRunner::set_feature(int Index, int64_t Value) { + *static_cast<int64_t *>(CompiledModel->arg_data(FeatureIndices[Index])) = + Value; +} + +int64_t InliningModelRunner::run(int64_t DefaultDecision) { + int64_t Ret = DefaultDecision; + if (isUsingInference()) { + CompiledModel->Run(); + Ret = *static_cast<int64_t *>(CompiledModel->result_data(ResultIndex)); + } + return Ret; +} + +InliningModelRunner::~InliningModelRunner() {} + +void InliningModelRunner::receiveReward(int64_t) {} + +} // namespace llvm + +#endif // LLVM_INLININGMODELRUNNERPRODUCTION_H Index: llvm/lib/Analysis/ML/InliningModelRunnerTraining.h =================================================================== --- /dev/null +++ llvm/lib/Analysis/ML/InliningModelRunnerTraining.h @@ -0,0 +1,263 @@ +//===- InliningModelRunnerTraining.h - training mode ML runner --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// + +#ifndef LLVM_INLININGMODELRUNNERTRAINING_H +#define LLVM_INLININGMODELRUNNERTRAINING_H + +#include "InliningModelFeatureMaps.h" +#include "TFUtils.h" + +#include "llvm/Analysis/ML/IRToNativeSizeLearning.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ManagedStatic.h" + +#include "tensorflow/c/c_api.h" + +using namespace llvm; + +static cl::opt + TrainingLog("training-log", cl::Hidden, + cl::desc("Path where the inlining log is saved.")); + +static cl::opt TFTrainedModelPath( + "ml-inliner-trained-model", cl::Hidden, + cl::desc( + "Path to saved model to use as policy during this training session.")); + +static cl::opt TFIR2NativeModelPath( + "ml-inliner-ir2native-model", cl::Hidden, + cl::desc("Path to saved model evaluating native size from IR.")); + +static cl::opt TFFeedPrefix("ml-inliner-trained-model-feed-prefix", + cl::Hidden, cl::init("action_"), + cl::desc("Prefix for feature names.")); + +static cl::opt TFDecisionName( + "ml-inliner-trained-model-decision-name", cl::Hidden, + cl::init("StatefulPartitionedCall"), + cl::desc("Name of the graph operation representing the decision.")); + +namespace llvm { +/// InliningModelRunner - training mode implementation. It uses TF C APIs to +/// dynamically load and evaluate a TF SavedModel. Runtime performance is +/// sacrificed for ease of use while training. +class InliningModelRunner { +public: + InliningModelRunner(LLVMContext &Ctx); + + ~InliningModelRunner(); + + int64_t run(int64_t DefaultDecision); + + void receiveReward(int64_t Reward); + + // Disallows copy and assign. 
+ InliningModelRunner(const InliningModelRunner &) = delete; + InliningModelRunner &operator=(const InliningModelRunner &) = delete; + + void set_feature(int Index, int64_t Value); + int64_t get_feature(int Index) const; + bool isLogging() const { return !TrainingLog.empty(); } + +private: + bool isUsingInference() const; + + std::vector LoggedFeatures; + std::vector LoggedDefaultDecisions; + std::vector LoggedDecisions; + std::vector LoggedRewards; + + std::unique_ptr Evaluator; + + InliningFeatures FeatureStorage; + + // The training framework needs some additional features, that just need to be + // set to 0. + struct TensorSpec { + std::string Name; + TF_DataType Type; + }; + + const std::vector TrainingOnlyFeatures{ + {"inlining_default", TF_INT64}, + {"discount", TF_FLOAT}, + {"reward", TF_FLOAT}, + {"step_type", TF_INT32}}; +}; + +class FunctionSizeEstimator { +public: + FunctionSizeEstimator(LLVMContext &Ctx, FunctionAnalysisManager &FAM); + size_t getSizeEstimate(const Function &F); + bool isValid() const { return Evaluator && Evaluator->isValid(); } + +private: + std::unique_ptr Evaluator; + FunctionAnalysisManager &FAM; +}; + +InliningModelRunner::InliningModelRunner(LLVMContext &Ctx) { + if (!TFTrainedModelPath.empty()) { + std::vector InputNames; + std::vector OutputNames; + for (auto I = 0; I < FeatureList::NumberOfFeatures; ++I) + InputNames.push_back(TFFeedPrefix + FeatureNameMap[I]); + for (size_t I = 0; I < TrainingOnlyFeatures.size(); ++I) + InputNames.push_back(TFFeedPrefix + TrainingOnlyFeatures[I].Name); + OutputNames.push_back(TFDecisionName); + + Evaluator = std::make_unique(Ctx, TFTrainedModelPath, + InputNames, OutputNames); + if (!Evaluator || !Evaluator->isValid()) { + Ctx.emitError("Failed to create inliner saved model evaluator"); + Evaluator.reset(); + return; + } + } else { + FeatureStorage.resize(FeatureList::NumberOfFeatures); + } + + if (!TrainingLog.empty()) { + for (auto I = 0; I < FeatureList::NumberOfFeatures; ++I) { + 
LoggedFeatures.push_back(InliningFeatures()); + } + } + if (!isUsingInference()) + return; + + static const std::vector Dim{1}; + + size_t InputIndex = 0; + for (; InputIndex < FeatureList::NumberOfFeatures; ++InputIndex) { + Evaluator->initInput(InputIndex, TF_INT64, Dim); + } + + for (; InputIndex < Evaluator->getInput().size(); ++InputIndex) { + auto TFType = + TrainingOnlyFeatures[InputIndex - FeatureList::NumberOfFeatures].Type; + Evaluator->initInput(InputIndex, TFType, Dim); + } +} + +bool InliningModelRunner::isUsingInference() const { return !!Evaluator; } + +int64_t InliningModelRunner::run(int64_t DefaultDecision) { + int64_t Decision = DefaultDecision; + if (isUsingInference()) { + std::vector Output{nullptr}; + if (!Evaluator->evaluate(Output)) + return DefaultDecision; + Decision = *(static_cast(TF_TensorData(Output[0]))); + TF_DeleteTensor(Output[0]); + } + if (!TrainingLog.empty()) { + for (auto I = 0; I < FeatureList::NumberOfFeatures; ++I) { + LoggedFeatures[I].push_back(get_feature(I)); + } + LoggedDefaultDecisions.push_back(DefaultDecision); + LoggedDecisions.push_back(Decision); + } + return Decision; +} + +int64_t InliningModelRunner::get_feature(int Index) const { + if (isUsingInference()) + return *( + static_cast(TF_TensorData(Evaluator->getInput()[Index]))); + return FeatureStorage[Index]; +} + +void InliningModelRunner::set_feature(int Index, int64_t Value) { + if (isUsingInference()) + *(static_cast(TF_TensorData(Evaluator->getInput()[Index]))) = + Value; + else + FeatureStorage[Index] = Value; +} + +#define LOG_DUMP(Name, Feature) \ + do { \ + OutFile << " feature_list: {\n"; \ + OutFile << " key: " \ + << "\"" << Name << "\"" \ + << " \n"; \ + OutFile << " value: {\n"; \ + for (const auto &feature : Feature) { \ + OutFile << " feature: {\n"; \ + OutFile << " int64_list: {\n"; \ + OutFile << " value: [ " << feature << " ]\n"; \ + OutFile << " }\n"; \ + OutFile << " }\n"; \ + } \ + OutFile << " }\n"; \ + OutFile << " }\n"; \ + } while 
(0); + +InliningModelRunner::~InliningModelRunner() { + if (!TrainingLog.empty()) { + std::error_code ErrorCode; + raw_fd_ostream OutFile(TrainingLog, ErrorCode); + + if (!LoggedDefaultDecisions.empty()) { + OutFile << "feature_lists: {\n"; + + for (size_t i = 0; i < LoggedFeatures.size(); i++) { + LOG_DUMP(FeatureNameMap.at(static_cast(i)), + LoggedFeatures[i]); + } + LOG_DUMP(DefaultDecisionName, LoggedDefaultDecisions); + LOG_DUMP(DecisionName, LoggedDecisions); + LOG_DUMP(RewardName, LoggedRewards); + + OutFile << "}\n"; + } + } +} + +#undef LOG_DUMP + +void InliningModelRunner::receiveReward(int64_t Reward) { + if (!TrainingLog.empty()) { + LoggedRewards.push_back(Reward); + } +} + +FunctionSizeEstimator::FunctionSizeEstimator(LLVMContext &Ctx, + FunctionAnalysisManager &FAM) + : FAM(FAM) { + std::vector InputNames{"serving_default_input_1"}; + std::vector OutputName{"StatefulPartitionedCall"}; + Evaluator = std::make_unique( + Ctx, TFIR2NativeModelPath.getValue().c_str(), InputNames, OutputName); + if (!Evaluator || !Evaluator->isValid()) + Evaluator.reset(); + static const std::vector Dim{ + 1, IRToNativeSizeLearning::FunctionFeatures::FeatureCount}; + + Evaluator->initInput(0, TF_INT32, Dim); +} + +size_t FunctionSizeEstimator::getSizeEstimate(const Function &F) { + auto Features = IRToNativeSizeLearning::getFunctionFeatures( + const_cast(F), FAM); + int32_t *V = static_cast(TF_TensorData(Evaluator->getInput()[0])); + Features.FillTensor(V); + std::vector Output{nullptr}; + if (!Evaluator->evaluate(Output)) + return 0; + float Ret = *(static_cast(TF_TensorData(Output[0]))); + TF_DeleteTensor(Output[0]); + if (Ret < 0) + Ret = 0.0; + return static_cast(Ret); +} +} // namespace llvm + +#endif // LLVM_INLININGMODELRUNNERTRAINING_H Index: llvm/lib/Analysis/ML/TFUtils.h =================================================================== --- /dev/null +++ llvm/lib/Analysis/ML/TFUtils.h @@ -0,0 +1,74 @@ +//===- TFUtils.h - utilities for tensorflow C API 
---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +#ifndef LLVM_TFUTILS_H +#define LLVM_TFUTILS_H + +#include "tensorflow/c/c_api.h" +#include "llvm/IR/LLVMContext.h" + +#include +#include + +namespace llvm { + +bool ensureInitTF(); + +using TFGraphPtr = std::unique_ptr; +using TFSessionOptionsPtr = + std::unique_ptr; +using TFStatusPtr = std::unique_ptr; + +TFGraphPtr createTFGraph(); +TFStatusPtr createTFStatus(); +TFSessionOptionsPtr createTFSessionOptions(); + +/// Load a SavedModel, find the given inputs and outputs, and setup storage +/// for input tensors. The user is responsible for allocating, initializing, +/// setting values, and deallocating the input tensors, and for deallocating the +/// output tensors. +class TFModelEvaluator final { +public: + TFModelEvaluator(LLVMContext &Ctx, StringRef SavedModelPath, + const std::vector &InputNames, + const std::vector &OutputNames, + const char *Tags = "serve"); + ~TFModelEvaluator(); + TFModelEvaluator(const TFModelEvaluator &) = delete; + TFModelEvaluator(TFModelEvaluator &&) = delete; + + /// Evaluate the model, assuming it is valid. Returns false if the evaluation + /// fails or the model is invalid, true otherwise. The inputs are assumed to + /// have been already provided via getInput(). When returning false, it also + /// marks the object invalid. + bool evaluate(std::vector &Output); + + /// Provides access to the input vector. It is already dimensioned correctly, + /// but the values need to be allocated by the user. + std::vector &getInput() { return Input; } + + /// Returns true if the tensorflow model was loaded successfully, false + /// otherwise. 
+ bool isValid() const { return !!Session; } + void initInput(int Index, TF_DataType Type, + const std::vector &Dimensions); + +private: + LLVMContext &Ctx; + TF_Session *Session = nullptr; + TFGraphPtr Graph; + TFSessionOptionsPtr Options; + std::vector InputFeed; + std::vector Input; + std::vector OutputFeed; + void DeleteSession(); + bool CheckReportAndReset(const TF_Output &Output, StringRef Name); +}; +} // namespace llvm + +#endif // LLVM_TFUTILS_H Index: llvm/lib/Analysis/ML/TFUtils.cpp =================================================================== --- /dev/null +++ llvm/lib/Analysis/ML/TFUtils.cpp @@ -0,0 +1,138 @@ +//===- TFUtils.cpp - tensorflow evaluation utilities ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements utilities for interfacing with tensorflow C APIs. 
+// +//===----------------------------------------------------------------------===// + +#include "TFUtils.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/raw_ostream.h" + +#include "tensorflow/c/c_api_experimental.h" + +#include + +namespace llvm { + +struct TFInitializer { + TFInitializer() { + assert(!isInitialized && "TFInitialized should be called only once"); + int Argc = 1; + const char *Name = ""; + const char **NamePtr = &Name; + TF_InitMain(Name, &Argc, const_cast(&NamePtr)); + isInitialized = true; + } + bool isInitialized = false; +}; + +llvm::ManagedStatic TFInitializer; + +bool ensureInitTF() { return TFInitializer->isInitialized; } + +TFGraphPtr createTFGraph() { + return TFGraphPtr(TF_NewGraph(), &TF_DeleteGraph); +} + +TFStatusPtr createTFStatus() { + return TFStatusPtr(TF_NewStatus(), &TF_DeleteStatus); +} +TFSessionOptionsPtr createTFSessionOptions() { + return TFSessionOptionsPtr(TF_NewSessionOptions(), &TF_DeleteSessionOptions); +} + +TFModelEvaluator::TFModelEvaluator(LLVMContext &Ctx, StringRef SavedModelPath, + const std::vector &InputNames, + const std::vector &OutputNames, + const char *Tags) + : Ctx(Ctx), Graph(createTFGraph()), Options(createTFSessionOptions()), + InputFeed(InputNames.size()), Input(InputNames.size()), + OutputFeed(OutputNames.size()) { + if (!ensureInitTF()) { + Ctx.emitError("Tensorflow should have been initialized"); + return; + } + auto Status = createTFStatus(); + + Session = TF_LoadSessionFromSavedModel(Options.get(), nullptr, + SavedModelPath.str().c_str(), &Tags, 1, + Graph.get(), nullptr, Status.get()); + if (TF_GetCode(Status.get()) != TF_Code::TF_OK) { + Ctx.emitError(TF_Message(Status.get())); + DeleteSession(); + } + for (size_t I = 0; I < InputNames.size(); ++I) { + InputFeed[I] = { + TF_GraphOperationByName(Graph.get(), (InputNames[I]).c_str()), 0}; + if (!CheckReportAndReset(InputFeed[I], InputNames[I])) + return; + } + for 
(size_t I = 0; I < OutputNames.size(); ++I) { + OutputFeed[I] = { + TF_GraphOperationByName(Graph.get(), (OutputNames[I]).c_str()), 0}; + if (!CheckReportAndReset(OutputFeed[I], OutputNames[I])) + return; + } +} + +TFModelEvaluator::~TFModelEvaluator() { + for (auto *T : Input) { + TF_DeleteTensor(T); + } + DeleteSession(); +} + +bool TFModelEvaluator::CheckReportAndReset(const TF_Output &Output, + StringRef Name) { + if (Output.oper) + return true; + Ctx.emitError("Could not find TF_Output named: " + Name); + DeleteSession(); + return false; +} + +void TFModelEvaluator::DeleteSession() { + if (Session == nullptr) + return; + auto Status = createTFStatus(); + TF_DeleteSession(Session, Status.get()); + Session = nullptr; + if (TF_GetCode(Status.get()) != TF_Code::TF_OK) + Ctx.emitError("Could not delete TF session"); +} + +bool TFModelEvaluator::evaluate(std::vector &Output) { + if (!isValid()) + return false; + auto Status = createTFStatus(); + TF_SessionRun(Session, nullptr, InputFeed.data(), Input.data(), Input.size(), + OutputFeed.data(), Output.data(), Output.size(), nullptr, 0, + nullptr, Status.get()); + if (TF_GetCode(Status.get()) != TF_Code::TF_OK) { + Ctx.emitError(TF_Message(Status.get())); + return false; + } + return true; +} + +void TFModelEvaluator::initInput(int Index, TF_DataType Type, + const std::vector &Dimensions) { + int64_t TotalSize = TF_DataTypeSize(Type); + for (auto &D : Dimensions) + TotalSize *= D; + + Input[Index] = + TF_AllocateTensor(Type, Dimensions.data(), Dimensions.size(), TotalSize); + std::memset(TF_TensorData(Input[Index]), 0, TotalSize); +} + +} // namespace llvm Index: llvm/lib/CMakeLists.txt =================================================================== --- llvm/lib/CMakeLists.txt +++ llvm/lib/CMakeLists.txt @@ -34,3 +34,4 @@ add_subdirectory(Testing) endif() add_subdirectory(WindowsManifest) + Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- 
llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -39,6 +39,7 @@ #include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopNestAnalysis.h" +#include "llvm/Analysis/ML/InliningAdvisor.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" @@ -215,6 +216,15 @@ "enable-npm-gvn-hoist", cl::init(false), cl::Hidden, cl::desc("Enable the GVN hoisting pass for the new PM (default = off)")); +static cl::opt EnableMLInliner( + "enable-ml-inliner", cl::init(false), cl::Hidden, + cl::desc("Enable ML policy for inliner. Currently trained for -Oz only")); + +cl::opt PerformMandatoryInliningsFirst( + "mandatory-inlinings-first", cl::init(false), + cl::desc("Perform all mandatory (always-inline) inlinings first, for the " + "whole module.")); + static cl::opt EnableGVNSink( "enable-npm-gvn-sink", cl::init(false), cl::Hidden, cl::desc("Enable the GVN hoisting pass for the new PM (default = off)")); @@ -690,10 +700,95 @@ return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel()); } -ModulePassManager -PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, - ThinLTOPhase Phase, - bool DebugLogging) { +ModulePassManager PassBuilder::buildInlinerPipeline(OptimizationLevel Level, + ThinLTOPhase Phase, + bool DebugLogging) { + ModulePassManager MPM(DebugLogging); + + if (PerformMandatoryInliningsFirst) { + CGSCCPassManager AlwaysInliningPipeline(DebugLogging); + AlwaysInliningPipeline.addPass( + InlinerPass(getInlineParamsFromOptLevel(Level), true)); + AlwaysInliningPipeline.addPass(AttributorCGSCCPass()); + + if (PTO.Coroutines) + AlwaysInliningPipeline.addPass(CoroSplitPass()); + + AlwaysInliningPipeline.addPass(PostOrderFunctionAttrsPass()); + AlwaysInliningPipeline.addPass(createCGSCCToFunctionPassAdaptor( + buildFunctionSimplificationPipeline(Level, Phase, DebugLogging))); + 
MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( + std::move(AlwaysInliningPipeline))); + } + + // Now begin the main postorder CGSCC pipeline. + // FIXME: The current CGSCC pipeline has its origins in the legacy pass + // manager and trying to emulate its precise behavior. Much of this doesn't + // make a lot of sense and we should revisit the core CGSCC structure. + CGSCCPassManager MainCGPipeline(DebugLogging); + + // Note: historically, the PruneEH pass was run first to deduce nounwind and + // generally clean up exception handling overhead. It isn't clear this is + // valuable as the inliner doesn't currently care whether it is inlining an + // invoke or a call. + + // Run the inliner first. The theory is that we are walking bottom-up and so + // the callees have already been fully optimized, and we want to inline them + // into the callers so that our optimizations can reflect that. + // For PreLinkThinLTO pass, we disable hot-caller heuristic for sample PGO + // because it makes profile annotation in the backend inaccurate. + InlineParams IP = getInlineParamsFromOptLevel(Level); + if (Phase == ThinLTOPhase::PreLink && PGOOpt && + PGOOpt->Action == PGOOptions::SampleUse) + IP.HotCallSiteThreshold = 0; + + if (EnableMLInliner) { + MPM.addPass(RequireAnalysisPass()); + } + + MainCGPipeline.addPass(InlinerPass(IP)); + if (!DisableAttributor) + MainCGPipeline.addPass(AttributorCGSCCPass()); + + if (PTO.Coroutines) + MainCGPipeline.addPass(CoroSplitPass()); + + // Now deduce any function attributes based in the current code. + MainCGPipeline.addPass(PostOrderFunctionAttrsPass()); + + // When at O3 add argument promotion to the pass pipeline. + // FIXME: It isn't at all clear why this should be limited to O3. + if (Level == OptimizationLevel::O3) + MainCGPipeline.addPass(ArgumentPromotionPass()); + + // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if + // there are no OpenMP runtime calls present in the module. 
+ if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3) + MainCGPipeline.addPass(OpenMPOptPass()); + + // Lastly, add the core function simplification pipeline nested inside the + // CGSCC walk. + MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( + buildFunctionSimplificationPipeline(Level, Phase, DebugLogging))); + + for (auto &C : CGSCCOptimizerLateEPCallbacks) + C(MainCGPipeline, Level); + + // We wrap the CGSCC pipeline in a devirtualization repeater. This will try + // to detect when we devirtualize indirect calls and iterate the SCC passes + // in that case to try and catch knock-on inlining or function attrs + // opportunities. Then we add it to the module pipeline by walking the SCCs + // in postorder (or bottom-up). + MPM.addPass( + createModuleToPostOrderCGSCCPassAdaptor(createDevirtSCCRepeatedPass( + std::move(MainCGPipeline), MaxDevirtIterations))); + if (EnableMLInliner) + MPM.addPass(InliningAdvisorCleanupPass()); + return MPM; +} + +ModulePassManager PassBuilder::buildModuleSimplificationPipeline( + OptimizationLevel Level, ThinLTOPhase Phase, bool DebugLogging) { ModulePassManager MPM(DebugLogging); bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse); @@ -836,58 +931,7 @@ // make a lot of sense and we should revisit the core CGSCC structure. CGSCCPassManager MainCGPipeline(DebugLogging); - // Note: historically, the PruneEH pass was run first to deduce nounwind and - // generally clean up exception handling overhead. It isn't clear this is - // valuable as the inliner doesn't currently care whether it is inlining an - // invoke or a call. - - // Run the inliner first. The theory is that we are walking bottom-up and so - // the callees have already been fully optimized, and we want to inline them - // into the callers so that our optimizations can reflect that. - // For PreLinkThinLTO pass, we disable hot-caller heuristic for sample PGO - // because it makes profile annotation in the backend inaccurate. 
- InlineParams IP = getInlineParamsFromOptLevel(Level); - if (Phase == ThinLTOPhase::PreLink && PGOOpt && - PGOOpt->Action == PGOOptions::SampleUse) - IP.HotCallSiteThreshold = 0; - MainCGPipeline.addPass(InlinerPass(IP)); - - if (!DisableAttributor) - MainCGPipeline.addPass(AttributorCGSCCPass()); - - if (PTO.Coroutines) - MainCGPipeline.addPass(CoroSplitPass()); - - // Now deduce any function attributes based in the current code. - MainCGPipeline.addPass(PostOrderFunctionAttrsPass()); - - // When at O3 add argument promotion to the pass pipeline. - // FIXME: It isn't at all clear why this should be limited to O3. - if (Level == OptimizationLevel::O3) - MainCGPipeline.addPass(ArgumentPromotionPass()); - - // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if - // there are no OpenMP runtime calls present in the module. - if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3) - MainCGPipeline.addPass(OpenMPOptPass()); - - // Lastly, add the core function simplification pipeline nested inside the - // CGSCC walk. - MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( - buildFunctionSimplificationPipeline(Level, Phase, DebugLogging))); - - for (auto &C : CGSCCOptimizerLateEPCallbacks) - C(MainCGPipeline, Level); - - // We wrap the CGSCC pipeline in a devirtualization repeater. This will try - // to detect when we devirtualize indirect calls and iterate the SCC passes - // in that case to try and catch knock-on inlining or function attrs - // opportunities. Then we add it to the module pipeline by walking the SCCs - // in postorder (or bottom-up). 
- MPM.addPass( - createModuleToPostOrderCGSCCPassAdaptor(createDevirtSCCRepeatedPass( - std::move(MainCGPipeline), MaxDevirtIterations))); - + MPM.addPass(buildInlinerPipeline(Level, Phase, DebugLogging)); return MPM; } Index: llvm/lib/Passes/PassRegistry.def =================================================================== --- llvm/lib/Passes/PassRegistry.def +++ llvm/lib/Passes/PassRegistry.def @@ -27,6 +27,7 @@ MODULE_ANALYSIS("verify", VerifierAnalysis()) MODULE_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) MODULE_ANALYSIS("asan-globals-md", ASanGlobalsMetadataAnalysis()) +MODULE_ANALYSIS("inlining-advisor", InliningAdvisorAnalysis()) #ifndef MODULE_ALIAS_ANALYSIS #define MODULE_ALIAS_ANALYSIS(NAME, CREATE_PASS) \ @@ -65,6 +66,8 @@ MODULE_PASS("ipsccp", IPSCCPPass()) MODULE_PASS("lowertypetests", LowerTypeTestsPass(nullptr, nullptr)) MODULE_PASS("mergefunc", MergeFunctionsPass()) +MODULE_PASS("scc-oz-module-inliner", + buildInlinerPipeline(OptimizationLevel::Oz, ThinLTOPhase::None, DebugLogging)) MODULE_PASS("name-anon-globals", NameAnonGlobalPass()) MODULE_PASS("no-op-module", NoOpModulePass()) MODULE_PASS("partial-inliner", PartialInlinerPass()) Index: llvm/lib/Transforms/IPO/CMakeLists.txt =================================================================== --- llvm/lib/Transforms/IPO/CMakeLists.txt +++ llvm/lib/Transforms/IPO/CMakeLists.txt @@ -1,3 +1,8 @@ +if (${LLVM_ENABLE_ML_HEURISTICS}) + set(LLVM_IPO_ML_DEPS MLHeuristics) + add_subdirectory(ML) +endif() + add_llvm_component_library(LLVMipo AlwaysInliner.cpp ArgumentPromotion.cpp @@ -44,4 +49,7 @@ DEPENDS intrinsics_gen + ${LLVM_IPO_ML_DEPS} + + LINK_LIBS ${LLVM_IPO_ML_DEPS} ) Index: llvm/lib/Transforms/IPO/Inliner.cpp =================================================================== --- llvm/lib/Transforms/IPO/Inliner.cpp +++ llvm/lib/Transforms/IPO/Inliner.cpp @@ -30,12 +30,11 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" #include 
"llvm/Analysis/LazyCallGraph.h" +#include "llvm/Analysis/ML/InliningAdvisor.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/CallPromotionUtils.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" @@ -58,8 +57,10 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/CallPromotionUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include #include @@ -159,9 +160,9 @@ /// *actually make it to the backend*, which is really what we want. /// /// Because we don't have this information, we do this simple and useful hack. -static void mergeInlinedArrayAllocas( - Function *Caller, InlineFunctionInfo &IFI, - InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory) { +static void mergeInlinedArrayAllocas(Function *Caller, InlineFunctionInfo &IFI, + InlinedArrayAllocasTy &InlinedArrayAllocas, + int InlineHistory) { SmallPtrSet UsedAllocas; // When processing our SCC, check to see if CS was inlined from some other @@ -897,6 +898,24 @@ assert(InitialC.size() > 0 && "Cannot handle an empty SCC!"); Module &M = *InitialC.begin()->getFunction().getParent(); ProfileSummaryInfo *PSI = MAM.getCachedResult(M); + InliningAdvisor *Advisor = MAM.getCachedResult(M); + assert(!MandatoryOnly || !Advisor); + if (Advisor) + Advisor->OnPassEntry(); + + // Avoid subtle bugs due to alternative exits from this method - if we have + // an advisor, ensure it is always informed when we're done with a scc. 
+ class AdvisorExitCapture final { + InliningAdvisor *const Advisor; + + public: + AdvisorExitCapture(InliningAdvisor *A) : Advisor(A) {} + ~AdvisorExitCapture() { + if (Advisor) + Advisor->OnPassExit(); + } + }; + AdvisorExitCapture Capturer(Advisor); if (!ImportedFunctionsStats && InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No) { @@ -1010,8 +1029,7 @@ // node however because those functions aren't going to be mutated by this // pass. FunctionAnalysisManager &FAM = - AM.getResult(*C, CG) - .getManager(); + AM.getResult(*C, CG).getManager(); // Get the remarks emission analysis for the caller. auto &ORE = FAM.getResult(F); @@ -1068,17 +1086,58 @@ continue; } - Optional OIC = shouldInline(CS, GetInlineCost, ORE); - // Check whether we want to inline this callsite. - if (!OIC.hasValue()) { - setInlineRemark(CS, "deferred"); + CallBase *CB = dyn_cast(CS.getInstruction()); + auto TrivialDecision = llvm::getTrivialInliningDecision( + *CB, CS.getCalledFunction(), FAM.getResult(Callee), + GetAssumptionCache, GetTLI); + + if (Advisor && + ((TrivialDecision.hasValue() && TrivialDecision->isNever()) || + &Callee == &F)) + continue; + if (MandatoryOnly && + (!TrivialDecision.hasValue() || !TrivialDecision->isAlways())) continue; - } - if (!OIC.getValue()) { - // shouldInline() call returned a negative inline cost that explains - // why this callsite should not be inlined. - setInlineRemark(CS, inlineCostStr(*OIC)); + // TODO(mtrofin): this replicates the already calculated + // TrivialDecision part when we don't do inference. Refactor to avoid. + const bool Mandatory = + TrivialDecision.hasValue() && TrivialDecision->isAlways(); + + // TODO(mtrofin): no need to compute OIC if Advisor is doing inference and + // no logging. 
+ Optional OIC = shouldInline(CS, GetInlineCost, ORE); + assert(!Mandatory || (OIC.hasValue() && OIC.getValue())); + bool ShouldInline = Mandatory || (OIC.hasValue() && OIC.getValue()); + // A deep analysis of the callsite may reveal blocking reasons for not + // inlining, such as VarArgs, or large stack sizes. Stop in that case, as + // inlining would cause a correctness problem. + int CostEstimate = 0; + // If the inlining is mandatory, we won't use the cost, so can set it to 0 + if (!Mandatory) { + auto IsCallsiteInlinable = llvm::getInliningCostEstimate( + *CB, FAM.getResult(Callee), GetAssumptionCache, + {}, nullptr, nullptr); + if (!IsCallsiteInlinable) + continue; + CostEstimate = IsCallsiteInlinable.getValue(); + } + PendingInliningRecord PendingRecord; + if (Advisor) { + PendingRecord = + Advisor->shouldInline(CB, ShouldInline, Mandatory, CostEstimate); + } + if (!ShouldInline) { + // Check whether we want to inline this callsite. + if (!OIC.hasValue()) { + setInlineRemark(CS, "deferred"); + } else if (!OIC.getValue()) { + // shouldInline() call returned a negative inline cost that explains + // why this callsite should not be inlined. + setInlineRemark(CS, inlineCostStr(*OIC)); + } + if (PendingRecord) + PendingRecord.recordInlining(false, false); continue; } @@ -1097,14 +1156,17 @@ InlineResult IR = InlineFunction(CS, IFI); if (!IR.isSuccess()) { - setInlineRemark(CS, std::string(IR.getFailureReason()) + "; " + - inlineCostStr(*OIC)); + setInlineRemark( + CS, std::string(IR.getFailureReason()) + "; " + + (OIC.hasValue() ? 
inlineCostStr(*OIC) : "ML Advisor")); ORE.emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block) << NV("Callee", &Callee) << " will not be inlined into " << NV("Caller", &F) << ": " << NV("Reason", IR.getFailureReason()); }); + if (PendingRecord) + PendingRecord.recordInlining(false, false); continue; } DidInline = true; @@ -1112,7 +1174,10 @@ ++NumInlined; - emit_inlined_into(ORE, DLoc, Block, Callee, F, *OIC); + // TODO(mtrofin): OIC may not have value if Advisor decided against + // inlining. We should still emit a remark. + if (OIC.hasValue()) + emit_inlined_into(ORE, DLoc, Block, Callee, F, *OIC); // Add any new callsites to defined functions to the worklist. if (!IFI.InlinedCallSites.empty()) { @@ -1144,6 +1209,7 @@ // dead. In that case, we can drop the body of the function eagerly // which may reduce the number of callers of other functions to one, // changing inline cost thresholds. + bool CalleeWasDeleted = false; if (Callee.hasLocalLinkage()) { // To check this we also need to nuke any dead constant uses (perhaps // made dead by this operation on other functions). @@ -1163,8 +1229,13 @@ assert(find(DeadFunctions, &Callee) == DeadFunctions.end() && "Cannot put cause a function to become dead twice!"); DeadFunctions.push_back(&Callee); + CalleeWasDeleted = true; + if (Advisor) + Advisor->OnFunctionDeleted(&Callee); } } + if (PendingRecord) + PendingRecord.recordInlining(CalleeWasDeleted, true); } // Back the call index up by one to put us in a good position to go around @@ -1243,8 +1314,7 @@ // function there. Also, cclear out any cached analyses. auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF)); FunctionAnalysisManager &FAM = - AM.getResult(DeadC, CG) - .getManager(); + AM.getResult(DeadC, CG).getManager(); FAM.clear(*DeadF, DeadF->getName()); AM.clear(DeadC, DeadC.getName()); auto &DeadRC = DeadC.getOuterRefSCC(); @@ -1256,7 +1326,19 @@ UR.InvalidatedRefSCCs.insert(&DeadRC); // And delete the actual function from the module. 
- M.getFunctionList().erase(DeadF); + // If we use the Advisor, it uses Function pointers to index various + // maps, e.g. memoization. Function cleanup passes like argument promotion + // create new functions. It is possible for a new function to be allocated + // at the address of a deleted function. + // We could index using names, but that's inefficient. Alternatively, + // we let the Advisor free the functions. + if (Advisor) { + DeadF->getBasicBlockList().clear(); + M.getFunctionList().remove(DeadF); + } else { + M.getFunctionList().erase(DeadF); + } + ++NumDeleted; } Index: llvm/test/Bindings/Go/lit.local.cfg =================================================================== --- llvm/test/Bindings/Go/lit.local.cfg +++ llvm/test/Bindings/Go/lit.local.cfg @@ -9,6 +9,9 @@ if not config.root.include_go_tests: config.unsupported = True +if config.use_ml_policies != '': + config.unsupported = True + def find_executable(executable, path=None): if path is None: path = os.environ['PATH'] Index: llvm/test/Other/new-pm-defaults.ll =================================================================== --- llvm/test/Other/new-pm-defaults.ll +++ llvm/test/Other/new-pm-defaults.ll @@ -132,6 +132,8 @@ ; CHECK-O-NEXT: Running analysis: CallGraphAnalysis ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ProfileSummaryAnalysis ; CHECK-O-NEXT: Running analysis: ProfileSummaryAnalysis +; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> +; CHECK-O-NEXT: Starting llvm::Module pass manager run. ; CHECK-O-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}LazyCallGraph{{.*}}> ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy ; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis @@ -226,6 +228,7 @@ ; CHECK-EP-CGSCC-LATE-NEXT: Running pass: NoOpCGSCCPass ; CHECK-O-NEXT: Finished CGSCC pass manager run. ; CHECK-O-NEXT: Finished llvm::Module pass manager run. +; CHECK-O-NEXT: Finished llvm::Module pass manager run. 
; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> ; CHECK-O-NEXT: Starting llvm::Module pass manager run. ; CHECK-O-NEXT: Running pass: GlobalOptPass Index: llvm/test/Other/new-pm-thinlto-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-defaults.ll +++ llvm/test/Other/new-pm-thinlto-defaults.ll @@ -97,6 +97,8 @@ ; CHECK-O-NEXT: Running analysis: CallGraphAnalysis ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ProfileSummaryAnalysis ; CHECK-PRELINK-O-NEXT: Running analysis: ProfileSummaryAnalysis +; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> +; CHECK-O-NEXT: Starting llvm::Module pass manager run. ; CHECK-O-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}LazyCallGraph{{.*}}> ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy ; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis @@ -196,6 +198,7 @@ ; CHECK-O-NEXT: Finished llvm::Function pass manager run. ; CHECK-O-NEXT: Finished CGSCC pass manager run. ; CHECK-O-NEXT: Finished llvm::Module pass manager run. +; CHECK-O-NEXT: Finished llvm::Module pass manager run. ; CHECK-PRELINK-O-NEXT: Running pass: GlobalOptPass ; CHECK-POSTLINK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> ; CHECK-POSTLINK-O-NEXT: Starting llvm::Module pass manager run. Index: llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -69,6 +69,8 @@ ; CHECK-O-NEXT: Running analysis: GlobalsAA ; CHECK-O-NEXT: Running analysis: CallGraphAnalysis ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ProfileSummaryAnalysis +; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> +; CHECK-O-NEXT: Starting {{.*}}Module pass manager run. 
; CHECK-O-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}LazyCallGraph{{.*}}> ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy ; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis @@ -167,6 +169,7 @@ ; CHECK-O-NEXT: Finished {{.*}}Function pass manager run. ; CHECK-O-NEXT: Finished CGSCC pass manager run. ; CHECK-O-NEXT: Finished {{.*}}Module pass manager run. +; CHECK-O-NEXT: Finished {{.*}}Module pass manager run. ; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> ; CHECK-O-NEXT: Starting {{.*}}Module pass manager run. ; CHECK-O-NEXT: Running pass: GlobalOptPass Index: llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -77,6 +77,8 @@ ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA ; CHECK-O-NEXT: Running analysis: GlobalsAA ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ProfileSummaryAnalysis +; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> +; CHECK-O-NEXT: Starting {{.*}}Module pass manager run. ; CHECK-O-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}LazyCallGraph{{.*}}> ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy ; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis @@ -178,6 +180,7 @@ ; CHECK-O-NEXT: Finished {{.*}}Function pass manager run. ; CHECK-O-NEXT: Finished CGSCC pass manager run. ; CHECK-O-NEXT: Finished {{.*}}Module pass manager run. +; CHECK-O-NEXT: Finished {{.*}}Module pass manager run. ; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> ; CHECK-O-NEXT: Starting {{.*}}Module pass manager run. 
; CHECK-O-NEXT: Running pass: GlobalOptPass Index: llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll +++ llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll @@ -96,6 +96,8 @@ ; CHECK-O-NEXT: Running analysis: GlobalsAA ; CHECK-O-NEXT: Running analysis: CallGraphAnalysis ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ProfileSummaryAnalysis +; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> +; CHECK-O-NEXT: Starting {{.*}}Module pass manager run. ; CHECK-O-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}LazyCallGraph{{.*}}> ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy ; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis @@ -216,6 +218,7 @@ ; CHECK-O-NEXT: Finished {{.*}}Function pass manager run. ; CHECK-O-NEXT: Finished CGSCC pass manager run. ; CHECK-O-NEXT: Finished {{.*}}Module pass manager run. +; CHECK-O-NEXT: Finished {{.*}}Module pass manager run. ; CHECK-O-NEXT: Running pass: GlobalOptPass ; CHECK-O-NEXT: Running analysis: TargetLibraryAnalysis on bar ; CHECK-O-NEXT: Running analysis: PassInstrumentationAnalysis on bar Index: llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll +++ llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll @@ -77,6 +77,8 @@ ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA ; CHECK-O-NEXT: Running analysis: GlobalsAA ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}ProfileSummaryAnalysis +; CHECK-O-NEXT: Running pass: PassManager<{{.*}}Module{{.*}}> +; CHECK-O-NEXT: Starting {{.*}}Module pass manager run. 
; CHECK-O-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}LazyCallGraph{{.*}}> ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy ; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis @@ -177,6 +179,7 @@ ; CHECK-O-NEXT: Finished {{.*}}Function pass manager run. ; CHECK-O-NEXT: Finished CGSCC pass manager run. ; CHECK-O-NEXT: Finished {{.*}}Module pass manager run. +; CHECK-O-NEXT: Finished {{.*}}Module pass manager run. ; CHECK-O-NEXT: Running pass: GlobalOptPass ; CHECK-O-NEXT: Finished {{.*}}Module pass manager run. ; CHECK-O-NEXT: Running pass: NameAnonGlobalPass Index: llvm/test/Transforms/Inline/ML/ensure-delete.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/Inline/ML/ensure-delete.ll @@ -0,0 +1,31 @@ +; RUN: opt -passes=scc-oz-module-inliner -ml-advisor-size-increase-threshold=0.5 -enable-ml-inliner=1 -S < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-grtev4-linux-gnu" + +declare i64 @f1() + +define internal i64 @f2() #0 { + %r = call i64 @f1() + %r2 = add i64 13, %r + ret i64 %r2 +} + +define internal i64 @indirect_caller(i1 %which) { + %f = select i1 %which, i64 ()* @f1, i64 ()* @f2 + %r = call i64 %f(), !callees !1 + ret i64 %r +} + +define i64 @top() { + %r = call i64 @indirect_caller(i1 1) + %r2 = call i64 @f2() + %r3 = add i64 %r, %r2 + ret i64 %r3 +} + +!1 = !{i64 ()* @f1, i64()* @f2} + +attributes #0 = { alwaysinline } + +; CHECK: !0 = distinct !{i64 ()* @f1, null} \ No newline at end of file Index: llvm/test/Transforms/Inline/ML/func-features.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/Inline/ML/func-features.ll @@ -0,0 +1,61 @@ +; RUN: opt -enable-ml-inliner -mandatory-inlinings-first -passes=scc-oz-module-inliner -debug-training-log=- %{model_path} %{ir2native_path} -S < %s | FileCheck -check-prefix=CHECK 
-check-prefix=%{mode} %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +declare i32 @f1(i32) +declare i32 @f2(i32) + +define dso_local i32 @branches(i32) { + %cond = icmp slt i32 %0, 3 + br i1 %cond, label %then, label %else + +then: + %ret.1 = call i32 @f1(i32 %0) + br label %last.block + +else: + %ret.2 = call i32 @f2(i32 %0) + br label %last.block + +last.block: + %ret = phi i32 [%ret.1, %then], [%ret.2, %else] + ret i32 %ret +} + +define dso_local i32 @top() { + %1 = call i32 @branches(i32 2) + ret i32 %1 +} + + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 7.0.0-6 (tags/RELEASE_700/final)"} + +; CHECK-LABEL: ModuleName: +; DEV-NEXT: InitialSize: 51 +; REL-NEXT: InitialSize: 5 +; CHECK-NEXT: Inlining Decision: 0 +; CHECK-NEXT: DeltaSize: 0 +; CHECK-NEXT: Success: 1 +; CHECK-NEXT: CallerName: top +; CHECK-NEXT: CalleeName: branches +; CHECK-NEXT: Features: +; CHECK-NEXT: callee_basic_block_count: 4 +; CHECK-NEXT: callsite_height: 1 +; CHECK-NEXT: node_count: 2 +; CHECK-NEXT: nr_ctant_params: 1 +; CHECK-NEXT: cost_estimate: 0 +; CHECK-NEXT: edge_count: 1 +; CHECK-NEXT: caller_users: 1 +; CHECK-NEXT: caller_conditionally_executed_blocks: 0 +; CHECK-NEXT: caller_basic_block_count: 1 +; CHECK-NEXT: callee_conditionally_executed_blocks: 2 +; CHECK-NEXT: callee_users: 2 +; CHECK-NEXT: FinalNodeCount: 2 +; CHECK-NEXT: FinalEdgeCount: 0 +; DEV-NEXT: FinalSize: 51 +; REL-NEXT: FinalSize: 5 \ No newline at end of file Index: llvm/test/Transforms/Inline/ML/graph-structure.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/Inline/ML/graph-structure.ll @@ -0,0 +1,131 @@ +; RUN: opt -passes=scc-oz-module-inliner -debug-training-log=- -enable-ml-inliner=1 %{model_path} %{ir2native_path} -S < %s | FileCheck -check-prefix=CHECK -check-prefix=%{mode} %s +; Test that we can collect a log in 'dev' 
mode, and that the log captures both successful and unsuccessful decisions +; RUN: opt -passes=scc-oz-module-inliner -debug-training-log=- -enable-ml-inliner=1 %{ir2native_path} -S < %s | FileCheck -check-prefix=CHECK-LOG-%{mode} %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-grtev4-linux-gnu" + +declare void @external_fct(i32) + +define dso_local i32 @top() { + %a = call i32 @multiplier(i32 5) + %b = call i32 @adder(i32 10) + %ret = add nsw i32 %a, %b + call void @external_fct(i32 %ret) + ret i32 %ret +} + +define internal dso_local i32 @adder(i32) { + %2 = alloca i32, align 4 + store i32 %0, i32* %2, align 4 + %3 = load i32, i32* %2, align 4 + %4 = call i32 @multiplier(i32 %3) + %5 = load i32, i32* %2, align 4 + %6 = call i32 @switcher(i32 1) + %7 = add nsw i32 %4, %6 + ret i32 %7 +} + +define internal i32 @multiplier(i32) { + %2 = alloca i32, align 4 + store i32 %0, i32* %2, align 4 + %3 = load i32, i32* %2, align 4 + %4 = load i32, i32* %2, align 4 + %5 = mul nsw i32 %3, %4 + ret i32 %5 +} + +define i32 @switcher(i32) { + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + store i32 %0, i32* %3, align 4 + %4 = load i32, i32* %3, align 4 + switch i32 %4, label %11 [ + i32 1, label %5 + i32 2, label %6 + ] + +;