Index: include/llvm/Analysis/InlineCost.h =================================================================== --- include/llvm/Analysis/InlineCost.h +++ include/llvm/Analysis/InlineCost.h @@ -16,6 +16,9 @@ #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include #include @@ -48,8 +51,51 @@ /// Do not inline functions which allocate this many bytes on the stack /// when the caller is recursive. const unsigned TotalAllocaSizeRecursiveCaller = 1024; +// We give preference (by reducing the cost by +// StaticSmallConstantTripCountBonus) to inlining of routines which make +// a loop in the called routine's code go from being unknown to a small +// constant (less than or equal to SmallTripCountMax). This was motivated +// by seeing such loops getting profitably unrolled. The value is equal to +// UnrollMaxIterationsCountToAnalyze from +// llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp. +const int SmallTripCountMax = 10; +const int StaticSmallConstantTripCountBonus = 410; } +/// \brief A cache to save loop related info during inlining of an SCC. +/// +/// This cache is used to store the DominatorTree, LoopInfo, and +/// ScalarEvolution for called functions which are candidates for +/// inlining. Unnecessarily recomputing them can add significantly to +/// compile time and memory consumption. +/// +/// The DominatorTree, LoopInfo, and ScalarEvolution for a function is +/// invalidated when someother function is inlined into it. +/// +typedef std::map DTMap; +typedef std::map LIMap; +typedef std::map SEMap; + +class InliningLoopInfoCache { + + DTMap DTMapSCC; + LIMap LIMapSCC; + SEMap SEMapSCC; + +public: + + InliningLoopInfoCache() {} + + ~InliningLoopInfoCache(); + + DominatorTree* getDT(Function *F); + LoopInfo* getLI(Function* F); + ScalarEvolution* getSE(Function *F, TargetLibraryInfo& TLI, + std::function &GetAssumptionCache); + void invalidateFunction(Function *F); + +}; + /// \brief Represents the cost of inlining a function. /// /// This supports special values for functions which should "always" or @@ -170,7 +216,9 @@ InlineCost getInlineCost(CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI, + TargetLibraryInfo &TLI, std::function &GetAssumptionCache, + InliningLoopInfoCache *ILIC, ProfileSummaryInfo *PSI); /// \brief Get an InlineCost with the callee explicitly specified. @@ -181,7 +229,9 @@ InlineCost getInlineCost(CallSite CS, Function *Callee, const InlineParams &Params, TargetTransformInfo &CalleeTTI, + TargetLibraryInfo &TLI, std::function &GetAssumptionCache, + InliningLoopInfoCache *ILIC, ProfileSummaryInfo *PSI); /// \brief Minimal filter to detect invalid constructs for inlining. Index: include/llvm/Transforms/IPO/InlinerPass.h =================================================================== --- include/llvm/Transforms/IPO/InlinerPass.h +++ include/llvm/Transforms/IPO/InlinerPass.h @@ -79,6 +79,7 @@ protected: AssumptionCacheTracker *ACT; ProfileSummaryInfo *PSI; + InliningLoopInfoCache *ILIC; ImportedFunctionsInliningStatistics ImportedFunctionsStats; }; Index: lib/Analysis/InlineCost.cpp =================================================================== --- lib/Analysis/InlineCost.cpp +++ lib/Analysis/InlineCost.cpp @@ -21,11 +21,16 @@ #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/InstVisitor.h" @@ -33,6 +38,7 @@ #include "llvm/IR/Operator.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/ValueMapper.h" using namespace llvm; @@ -69,6 +75,10 @@ /// The TargetTransformInfo available for this compilation. const TargetTransformInfo &TTI; + TargetLibraryInfo &TLI; + + InliningLoopInfoCache* ILIC; + /// Getter for the cache of @llvm.assume intrinsics. std::function &GetAssumptionCache; @@ -201,11 +211,13 @@ public: CallAnalyzer(const TargetTransformInfo &TTI, + TargetLibraryInfo &TLI, InliningLoopInfoCache *ILIC, std::function &GetAssumptionCache, ProfileSummaryInfo *PSI, Function &Callee, CallSite CSArg, const InlineParams &Params) - : TTI(TTI), GetAssumptionCache(GetAssumptionCache), PSI(PSI), F(Callee), - CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold), + : TTI(TTI), TLI(TLI), ILIC(ILIC), GetAssumptionCache(GetAssumptionCache), + PSI(PSI), F(Callee), CandidateCS(CSArg), Params(Params), + Threshold(Params.DefaultThreshold), Cost(0), IsCallerRecursive(false), IsRecursiveCall(false), ExposesReturnsTwice(false), HasDynamicAlloca(false), ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false), @@ -217,7 +229,7 @@ SROACostSavings(0), SROACostSavingsLost(0) {} bool analyzeCall(CallSite CS); - + bool propagatesConstantInnerLoopTripCount(CallSite &CS); int getThreshold() { return Threshold; } int getCost() { return Cost; } @@ -962,7 +974,8 @@ // out. Pretend to inline the function, with a custom threshold. auto IndirectCallParams = Params; IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold; - CallAnalyzer CA(TTI, GetAssumptionCache, PSI, *F, CS, IndirectCallParams); + CallAnalyzer CA(TTI, TLI, ILIC, GetAssumptionCache, PSI, *F, CS, + IndirectCallParams); if (CA.analyzeCall(CS)) { // We were able to inline the indirect call! Subtract the cost from the // threshold to get the bonus we want to apply, but don't go below zero. @@ -1187,6 +1200,132 @@ return cast(ConstantInt::get(IntPtrTy, Offset)); } +/// \brief Accessor functions for the InliningLoopCacheInfo +/// +/// Note the dependence: ScalarEvolution -> LoopInfo -> DominatorTree +/// All dependent parts are built on demand if needed. +/// + +DominatorTree* InliningLoopInfoCache::getDT(Function* F) { + auto It = DTMapSCC.find(F); + if (It != DTMapSCC.end()) + return It->second; + auto ret = new DominatorTree(*F); + DTMapSCC.insert(std::make_pair(F, ret)); + return ret; +} + +LoopInfo* InliningLoopInfoCache::getLI(Function* F) { + auto It = LIMapSCC.find(F); + if (It != LIMapSCC.end()) + return It->second; + DominatorTree* DT = getDT(F); + assert(DT != nullptr); + auto ret = new LoopInfo(*DT); + LIMapSCC.insert(std::make_pair(F, ret)); + return ret; +} + +ScalarEvolution* InliningLoopInfoCache::getSE(Function* F, + TargetLibraryInfo& TLI, + std::function &GetAssumptionCache) { + auto It = SEMapSCC.find(F); + if (It != SEMapSCC.end()) + return It->second; + DominatorTree* DT = getDT(F); + assert(DT != nullptr); + LoopInfo* LI = getLI(F); + assert(LI != nullptr); + AssumptionCache AC = GetAssumptionCache(*F); + ScalarEvolution* ret = new ScalarEvolution(*F, TLI, AC, *DT, *LI); + SEMapSCC.insert(std::make_pair(F, ret)); + return ret; +} + +void InliningLoopInfoCache::invalidateFunction(Function* F) { + auto DTit = DTMapSCC.find(F); + if (DTit != DTMapSCC.end()) { + delete DTit->second; + DTMapSCC.erase(DTit); + } + auto LIit = LIMapSCC.find(F); + if (LIit != LIMapSCC.end()) { + delete LIit->second; + LIMapSCC.erase(LIit); + } + auto SEit = SEMapSCC.find(F); + if (SEit != SEMapSCC.end()) { + delete SEit->second; + SEMapSCC.erase(SEit); + } +} + +InliningLoopInfoCache::~InliningLoopInfoCache() { + for (auto &DTI: DTMapSCC) + delete DTI.second; + DTMapSCC.clear(); + for (auto <I: LIMapSCC) + delete LTI.second; + LIMapSCC.clear(); + for (auto &STI: SEMapSCC) + delete STI.second; + SEMapSCC.clear(); +} + +/// \brief Heuristic for preferring inlining that exposes complete unrolling +/// +/// Return true if inlining CS will propagate a constant into the called +/// function F, which then lets us determine that an inner loop in F is +/// a small constant (and hence a good candidate for unrolling). +/// +bool CallAnalyzer::propagatesConstantInnerLoopTripCount(CallSite &CS) { + Function *F = CS.getCalledFunction(); + auto PI = F->arg_begin(), PE = F->arg_end(); + auto AI = CS.arg_begin(), AE = CS.arg_end(); + + for (; AI != AE; ++AI) { + Value *Actual = *AI; + if (PI == PE) + break; + ConstantInt *UB = dyn_cast(Actual); + if (UB == nullptr) { + PI++; + continue; + } + Argument *Formal = &*PI++; + for (Use &U : Formal->uses()) { + auto ICmp = dyn_cast(U.getUser()); + if (ICmp == nullptr) + continue; + BasicBlock *LatchBlock = ICmp->getParent(); + Loop *L = ILIC->getLI(F)->getLoopFor(LatchBlock); + if (L == nullptr || !L->empty() || L->getLoopLatch() != LatchBlock) + continue; + for (Use &UU : ICmp->uses()) { + if (UU.getUser() != LatchBlock->getTerminator()) + continue; + ScalarEvolution* SE = ILIC->getSE(F, TLI, GetAssumptionCache); + const SCEV *FBackCount = SE->getBackedgeTakenCount(L); + ValueToValueMap FormalToConstantActualParameterMap; + FormalToConstantActualParameterMap.insert( + std::make_pair(Formal, Actual)); + const SCEV *ABackCount + = SCEVParameterRewriter::rewrite(FBackCount, *SE, + FormalToConstantActualParameterMap, true); + const SCEVConstant *ABackCountConst + = dyn_cast(ABackCount); + if (ABackCountConst == nullptr) + continue; + uint64_t BackCount = ABackCountConst->getValue()->getZExtValue(); + if (BackCount >= InlineConstants::SmallTripCountMax) + continue; + return true; + } + } + } + return false; +} + /// \brief Analyze a call site for potential inlining. /// /// Returns true if inlining this call is viable, and false if it is not @@ -1263,6 +1402,15 @@ if (OnlyOneCallAndLocalLinkage) Cost -= InlineConstants::LastCallToStaticBonus; + // If a function is preferred for inlining, and may expose an unrolling + // opportunity, drop the cost somewhat. + bool StaticConstantTripCount + = (F.hasLocalLinkage() || F.hasLinkOnceODRLinkage()) && + (&F == CS.getCalledFunction()) && F.hasFnAttribute(Attribute::InlineHint) && + propagatesConstantInnerLoopTripCount(CS); + if (StaticConstantTripCount) + Cost -= InlineConstants::StaticSmallConstantTripCountBonus; + // If this function uses the coldcc calling convention, prefer not to inline // it. if (F.getCallingConv() == CallingConv::Cold) @@ -1448,17 +1596,21 @@ } InlineCost llvm::getInlineCost( - CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI, + CallSite CS, const InlineParams &Params, + TargetTransformInfo &CalleeTTI, TargetLibraryInfo &TLI, std::function &GetAssumptionCache, + InliningLoopInfoCache *ILIC, ProfileSummaryInfo *PSI) { return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI, - GetAssumptionCache, PSI); + TLI, GetAssumptionCache, ILIC, PSI); } InlineCost llvm::getInlineCost( CallSite CS, Function *Callee, const InlineParams &Params, TargetTransformInfo &CalleeTTI, + TargetLibraryInfo &TLI, std::function &GetAssumptionCache, + InliningLoopInfoCache *ILIC, ProfileSummaryInfo *PSI) { // Cannot inline indirect calls. @@ -1493,7 +1645,8 @@ DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "...\n"); - CallAnalyzer CA(CalleeTTI, GetAssumptionCache, PSI, *Callee, CS, Params); + CallAnalyzer CA(CalleeTTI, TLI, ILIC, GetAssumptionCache, PSI, *Callee, + CS, Params); bool ShouldInline = CA.analyzeCall(CS); DEBUG(CA.dump()); Index: lib/Transforms/IPO/InlineSimple.cpp =================================================================== --- lib/Transforms/IPO/InlineSimple.cpp +++ lib/Transforms/IPO/InlineSimple.cpp @@ -56,11 +56,13 @@ InlineCost getInlineCost(CallSite CS) override { Function *Callee = CS.getCalledFunction(); TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); + TargetLibraryInfo &TLI = TLIWP->getTLI(); std::function GetAssumptionCache = [&](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; - return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, PSI); + return llvm::getInlineCost(CS, Params, TTI, TLI, GetAssumptionCache, + ILIC, PSI); } bool runOnSCC(CallGraphSCC &SCC) override; @@ -68,7 +70,7 @@ private: TargetTransformInfoWrapperPass *TTIWP; - + TargetLibraryInfoWrapperPass *TLIWP; }; } // end anonymous namespace @@ -101,10 +103,12 @@ bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) { TTIWP = &getAnalysis(); + TLIWP = &getAnalysis(); return Inliner::runOnSCC(SCC); } void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); + AU.addRequired(); Inliner::getAnalysisUsage(AU); } Index: lib/Transforms/IPO/Inliner.cpp =================================================================== --- lib/Transforms/IPO/Inliner.cpp +++ lib/Transforms/IPO/Inliner.cpp @@ -425,6 +425,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, std::function GetAssumptionCache, ProfileSummaryInfo *PSI, TargetLibraryInfo &TLI, + InliningLoopInfoCache *ILIC, bool InsertLifetime, function_ref GetInlineCost, function_ref AARGetter, @@ -569,7 +570,7 @@ continue; } ++NumInlined; - + ILIC->invalidateFunction(Caller); // Report the inline decision. ORE.emit(OptimizationRemark(DEBUG_TYPE, "Inlined", DLoc, Block) << NV("Callee", Callee) << " inlined into " @@ -606,6 +607,7 @@ CalleeNode->removeAllCalledFunctions(); // Removing the node for callee from the call graph and delete it. + ILIC->invalidateFunction(Callee); delete CG.removeFunctionFromModule(CalleeNode); ++NumDeleted; } @@ -648,9 +650,14 @@ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; - return inlineCallsImpl(SCC, CG, GetAssumptionCache, PSI, TLI, InsertLifetime, + ILIC = new InliningLoopInfoCache(); + bool rv = inlineCallsImpl(SCC, CG, GetAssumptionCache, PSI, TLI, ILIC, + InsertLifetime, [this](CallSite CS) { return getInlineCost(CS); }, AARGetter, ImportedFunctionsStats); + delete ILIC; + ILIC = nullptr; + return rv; } /// Remove now-dead linkonce functions at the end of