Index: include/llvm/Analysis/InlineCost.h =================================================================== --- include/llvm/Analysis/InlineCost.h +++ include/llvm/Analysis/InlineCost.h @@ -102,6 +102,7 @@ class InlineCostAnalysis : public CallGraphSCCPass { TargetTransformInfoWrapperPass *TTIWP; AssumptionCacheTracker *ACT; + bool EnableDelayedInline; public: static char ID; @@ -136,6 +137,11 @@ /// \brief Minimal filter to detect invalid constructs for inlining. bool isInlineViable(Function &Callee); + + /// \brief Sets delayed inline mode which affects inline cost computation. + void enableDelayedInline(bool Value) { + EnableDelayedInline = Value; + } }; } Index: include/llvm/Bitcode/LLVMBitCodes.h =================================================================== --- include/llvm/Bitcode/LLVMBitCodes.h +++ include/llvm/Bitcode/LLVMBitCodes.h @@ -397,7 +397,8 @@ ATTR_KIND_IN_ALLOCA = 38, ATTR_KIND_NON_NULL = 39, ATTR_KIND_JUMP_TABLE = 40, - ATTR_KIND_DEREFERENCEABLE = 41 + ATTR_KIND_DEREFERENCEABLE = 41, + ATTR_KIND_DELAYED_INLINE = 42 }; enum ComdatSelectionKindCodes { Index: include/llvm/IR/Attributes.h =================================================================== --- include/llvm/IR/Attributes.h +++ include/llvm/IR/Attributes.h @@ -112,6 +112,8 @@ SanitizeMemory, ///< MemorySanitizer is on. UWTable, ///< Function must be in a unwind table ZExt, ///< Zero extended before/after call + // FIXME: how to create a hidden attribute for compiler use only. + DelayedInline, ///< Function is in delayed inlining mode EndAttrKinds ///< Sentinal value useful for loops }; @@ -274,8 +276,15 @@ /// attribute list. Since attribute lists are immutable, this returns the new /// list. AttributeSet removeAttribute(LLVMContext &C, unsigned Index, + StringRef Kind) const; + + /// \brief Remove the specified attribute at the specified index from this + /// attribute list. Since attribute lists are immutable, this returns the new + /// list. + AttributeSet removeAttribute(LLVMContext &C, unsigned Index, Attribute::AttrKind Attr) const; + /// \brief Remove the specified attributes at the specified index from this /// attribute list. Since attribute lists are immutable, this returns the new /// list. Index: include/llvm/IR/Function.h =================================================================== --- include/llvm/IR/Function.h +++ include/llvm/IR/Function.h @@ -190,6 +190,12 @@ getContext(), AttributeSet::FunctionIndex, N)); } + /// @brief Remove function attributes from this function. + void removeFnAttr(StringRef Kind) { + setAttributes(AttributeSets.removeAttribute( + getContext(), AttributeSet::FunctionIndex, Kind)); + } + /// @brief Add function attributes to this function. void addFnAttr(StringRef Kind) { setAttributes( Index: include/llvm/Transforms/IPO/InlinerPass.h =================================================================== --- include/llvm/Transforms/IPO/InlinerPass.h +++ include/llvm/Transforms/IPO/InlinerPass.h @@ -74,6 +74,10 @@ /// deal with that subset of the functions. bool removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly = false); +protected: + // Delayed inline mode affects how inline cost is computed. + bool EnableDelayedInline; + private: // InlineThreshold - Cache the value here for easy access. unsigned InlineThreshold; Index: lib/Analysis/IPA/InlineCost.cpp =================================================================== --- lib/Analysis/IPA/InlineCost.cpp +++ lib/Analysis/IPA/InlineCost.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/ConstantFolding.h" @@ -56,6 +57,7 @@ int Threshold; int Cost; + int BodyCost; bool IsCallerRecursive; bool IsRecursiveCall; @@ -145,7 +147,7 @@ CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT, Function &Callee, int Threshold) : TTI(TTI), ACT(ACT), F(Callee), Threshold(Threshold), Cost(0), - IsCallerRecursive(false), IsRecursiveCall(false), + BodyCost(0), IsCallerRecursive(false), IsRecursiveCall(false), ExposesReturnsTwice(false), HasDynamicAlloca(false), ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false), AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0), @@ -159,6 +161,7 @@ int getThreshold() { return Threshold; } int getCost() { return Cost; } + int getBodyCost() { return BodyCost; } // Keep a bunch of stats about the cost savings found so we can print them // out when debugging. @@ -754,14 +757,25 @@ } if (TTI.isLoweredToCall(F)) { - // We account for the average 1 instruction per call argument setup - // here. - Cost += CS.arg_size() * InlineConstants::InstrCost; - - // Everything other than inline ASM will also have a significant cost - // merely from making the call. - if (!isa(CS.getCalledValue())) - Cost += InlineConstants::CallPenalty; + // Check if there is cached body cost. + if (F->hasFnAttribute("CICost")) { + Attribute Attr = F->getFnAttribute("CICost"); + int BodyCost = 0; + if (!Attr.getValueAsString().getAsInteger(10, BodyCost)) + Cost += BodyCost; + else + Cost += InlineConstants::CallPenalty; + + } else { + // We account for the average 1 instruction per call argument setup + // here. + Cost += CS.arg_size() * InlineConstants::InstrCost; + + // Everything other than inline ASM will also have a significant cost + // merely from making the call. + if (!isa(CS.getCalledValue())) + Cost += InlineConstants::CallPenalty; + } } return Base::visitCallSite(CS); @@ -1135,6 +1149,7 @@ // the callee, so this is purely duplicate work). SmallPtrSet EphValues; CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F), EphValues); + BodyCost = Cost; // The worklist of live basic blocks in the callee *after* inlining. We avoid // adding basic blocks of the callee which can be proven to be dead for this @@ -1230,6 +1245,7 @@ return false; Threshold += VectorBonus; + BodyCost = Cost - BodyCost; return Cost < Threshold; } @@ -1263,7 +1279,7 @@ char InlineCostAnalysis::ID = 0; -InlineCostAnalysis::InlineCostAnalysis() : CallGraphSCCPass(ID) {} +InlineCostAnalysis::InlineCostAnalysis() : CallGraphSCCPass(ID), EnableDelayedInline(false) {} InlineCostAnalysis::~InlineCostAnalysis() {} @@ -1341,6 +1357,26 @@ // Check if there was a reason to force inlining or no inlining. if (!ShouldInline && CA.getCost() < CA.getThreshold()) return InlineCost::getNever(); + + if (ShouldInline && EnableDelayedInline) { + // For every Cx, we record its body cost to make a better estimate on + // the real cost penalty of Cx when inlining B to A in delayed inlining + // mode. For normal process. + Function *F = CS.getCalledFunction(); + int BodyCost = CA.getBodyCost(); + + // If we set body cost to the real cost, it will make delayed inlining + // useless for A->B->C problem (body cost can make B too big). + // Because no matter what order, the cost of B will be one + // after C inlined to B. B will not be inlined. + // So we must decrease the cost to be a lower one to boost the local + // threshold larger that the one set. + if (BodyCost > InlineConstants::CallPenalty) + BodyCost = InlineConstants::CallPenalty; + StringRef CostStr = itostr(BodyCost); + F->addFnAttr("CICost", CostStr); + } + if (ShouldInline && CA.getCost() >= CA.getThreshold()) return InlineCost::getAlways(); Index: lib/Bitcode/Reader/BitcodeReader.cpp =================================================================== --- lib/Bitcode/Reader/BitcodeReader.cpp +++ lib/Bitcode/Reader/BitcodeReader.cpp @@ -1133,6 +1133,8 @@ return Attribute::UWTable; case bitc::ATTR_KIND_Z_EXT: return Attribute::ZExt; + case bitc::ATTR_KIND_DELAYED_INLINE: + return Attribute::DelayedInline; } } Index: lib/Bitcode/Writer/BitcodeWriter.cpp =================================================================== --- lib/Bitcode/Writer/BitcodeWriter.cpp +++ lib/Bitcode/Writer/BitcodeWriter.cpp @@ -240,6 +240,8 @@ return bitc::ATTR_KIND_UW_TABLE; case Attribute::ZExt: return bitc::ATTR_KIND_Z_EXT; + case Attribute::DelayedInline: + return bitc::ATTR_KIND_DELAYED_INLINE; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: Index: lib/IR/Attributes.cpp =================================================================== --- lib/IR/Attributes.cpp +++ lib/IR/Attributes.cpp @@ -249,6 +249,8 @@ return "zeroext"; if (hasAttribute(Attribute::Cold)) return "cold"; + if (hasAttribute(DelayedInline)) + return "delayed_inline"; // FIXME: These should be output like this: // @@ -426,6 +428,7 @@ case Attribute::InAlloca: return 1ULL << 43; case Attribute::NonNull: return 1ULL << 44; case Attribute::JumpTable: return 1ULL << 45; + case Attribute::DelayedInline: return 1ULL << 46; case Attribute::Dereferenceable: llvm_unreachable("dereferenceable attribute not supported in raw format"); } @@ -790,6 +793,15 @@ return removeAttributes(C, Index, AttributeSet::get(C, Index, Attr)); } +AttributeSet AttributeSet::removeAttribute(LLVMContext &C, unsigned Index, + StringRef Kind) const { + if (!hasAttribute(Index, Kind)) + return *this; + llvm::AttrBuilder B; + B.addAttribute(Kind); + return removeAttributes(C, Index, AttributeSet::get(C, Index, B)); +} + AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index, AttributeSet Attrs) const { if (!pImpl) return AttributeSet(); Index: lib/IR/Verifier.cpp =================================================================== --- lib/IR/Verifier.cpp +++ lib/IR/Verifier.cpp @@ -934,7 +934,8 @@ I->getKindAsEnum() == Attribute::NoBuiltin || I->getKindAsEnum() == Attribute::Cold || I->getKindAsEnum() == Attribute::OptimizeNone || - I->getKindAsEnum() == Attribute::JumpTable) { + I->getKindAsEnum() == Attribute::JumpTable || + I->getKindAsEnum() == Attribute::DelayedInline) { if (!isFunction) { CheckFailed("Attribute '" + I->getAsString() + "' only applies to functions!", V); Index: lib/Transforms/IPO/InlineAlways.cpp =================================================================== --- lib/Transforms/IPO/InlineAlways.cpp +++ lib/Transforms/IPO/InlineAlways.cpp @@ -108,6 +108,7 @@ bool AlwaysInliner::runOnSCC(CallGraphSCC &SCC) { ICA = &getAnalysis(); + ICA->enableDelayedInline(EnableDelayedInline); return Inliner::runOnSCC(SCC); } Index: lib/Transforms/IPO/InlineSimple.cpp =================================================================== --- lib/Transforms/IPO/InlineSimple.cpp +++ lib/Transforms/IPO/InlineSimple.cpp @@ -96,6 +96,7 @@ bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) { ICA = &getAnalysis(); + ICA->enableDelayedInline(EnableDelayedInline); return Inliner::runOnSCC(SCC); } Index: lib/Transforms/IPO/Inliner.cpp =================================================================== --- lib/Transforms/IPO/Inliner.cpp +++ lib/Transforms/IPO/Inliner.cpp @@ -61,6 +61,10 @@ ColdThreshold("inlinecold-threshold", cl::Hidden, cl::init(225), cl::desc("Threshold for inlining functions with cold attribute")); +cl::opt +EnableDelayedInlineOpt("enable-delayed-inline", cl::Hidden, + cl::init(false), cl::ZeroOrMore); + // Threshold to use when optsize is specified (and there is no -inline-limit). const int OptSizeThreshold = 75; @@ -422,6 +426,57 @@ return false; } +static bool toBeVisitedAgain(Function *F) { + if (F->use_empty()) + return false; + + // Check if any use of F is a call instruction of specific type. + for (User *U : F->users()) { + if (CallInst *I = dyn_cast(U)) { + CallSite CS(I); + Function *Callee = CS.getCalledFunction(); + // We must ensure this call will be processed again. + if (Callee && !Callee->isDeclaration() && Callee == F) + return true; + } + } + + return false; +} + +static unsigned +insertFunctionCallSites(Function *F, + SmallVectorImpl>& CallSites, + unsigned CSi) { + unsigned Count = 0; + + for (auto &BB : *F) + for (auto &Inst : BB) { + auto *I = &Inst; + // If this isn't a call, or it is a call to an intrinsic, it can + // never be inlined. + if (!isa(I) || isa(I)) + continue; + + CallSite CS(I); + // If this is a direct call to an external function, we can never inline + // it. If it is an indirect call, inlining may resolve it to be a + // direct call, so we keep it. + if (!CS.getCalledFunction() || CS.getCalledFunction()->isDeclaration()) + continue; + + if (CS.getCalledFunction() == F) + continue; + + CallSites.insert(CallSites.begin() + CSi, std::make_pair(CS, -1)); + DEBUG(dbgs() << " -> Add delayed call site: " + << *CS.getInstruction() << "\n"); + ++Count; + } + + return Count; +} + bool Inliner::runOnSCC(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis().getCallGraph(); AssumptionCacheTracker *ACT = &getAnalysis(); @@ -429,6 +484,8 @@ const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; AliasAnalysis *AA = &getAnalysis(); + EnableDelayedInline = EnableDelayedInlineOpt; + SmallPtrSet SCCFunctions; DEBUG(dbgs() << "Inliner visiting SCC:"); for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { @@ -488,10 +545,76 @@ InlinedArrayAllocasTy InlinedArrayAllocas; InlineFunctionInfo InlineInfo(&CG, AA, ACT); + // For solving A->B->Cx problem,we check if all calls (Cx) in function B + // will be inlined. If yes, we don't inline any of them, we wait for the time + // when testing B->A by putting B into delayed mode. The estimated cost for + // inlined Cx will be recorded. + // + // If B is inlined into A with estimated cost of inlined Cx, all Cx will be + // revisited and inlined, no extra action to do. Ff B is not inlined into A + // with estimated cost of inlined Cx, all Cx will be inlined into B and + // re-test if can inline B->A again. + if (EnableDelayedInline) { + bool ShouldInlineAll = true; + DEBUG(dbgs() << " BEGIN - Evaluating Delay inlining action: \n"); + for (unsigned CSi = 0, E = CallSites.size(); CSi != E; ++CSi) { + CallSite CS = CallSites[CSi].first; + + Function *Callee = CS.getCalledFunction(); + + if (!isInstructionTriviallyDead(CS.getInstruction(), TLI)) { + // We can only inline direct calls to non-declarations. + if (!Callee || Callee->isDeclaration()) + continue; + + // Check if the policy determines that we should inline this function. + DEBUG(dbgs() << " EVAL: "); + if (!shouldInline(CS)) { + ShouldInlineAll = false; + break; + } + } + } + + if (ShouldInlineAll) { + // Set delayed mode for all functions in this SCC only if there is + // function A that calls B, in the A->B->C case. + bool ToBeVisitedAgain = false; + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); + if (!F) + continue; + + if (toBeVisitedAgain(F)) { + ToBeVisitedAgain = true; + break; + } + } + + if (ToBeVisitedAgain) { + for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; + ++I) { + Function *F = (*I)->getFunction(); + if (!F) + continue; + + // Marking B as delayed. + F->addFnAttr(Attribute::DelayedInline); + DEBUG(dbgs() << " Delay inlining flag set for all calls in: " + << F->getName() << "\n"); + } + // Do not try inlining anymore at this point. + return false; + } + } + DEBUG(dbgs() << " END - Evaluating Delay inlining action. \n"); + } + // Now that we have all of the call sites, loop over them and inline them if // it looks profitable to do so. bool Changed = false; bool LocalChange; + unsigned DelayedCount = 0; do { LocalChange = false; // Iterate over the outer loop because inlining functions can cause indirect @@ -510,6 +633,8 @@ DEBUG(dbgs() << " -> Deleting dead call: " << *CS.getInstruction() << "\n"); // Update the call graph by deleting the edge from Callee to Caller. + if (Callee->hasFnAttribute(Attribute::DelayedInline)) + Callee->removeFnAttr(Attribute::DelayedInline); CG[Caller]->removeCallEdgeFor(CS); CS.getInstruction()->eraseFromParent(); ++NumCallsDeleted; @@ -532,13 +657,42 @@ // Get DebugLoc to report. CS will be invalid after Inliner. DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); + bool InDelayedMakeupProcess = false; + bool IsCalleeDelayedFunc = false; + if (DelayedCount > 0) { + DEBUG(dbgs() << "D#" << DelayedCount << " "); + --DelayedCount; + InDelayedMakeupProcess = true; + if (Callee->hasFnAttribute(Attribute::DelayedInline)) + IsCalleeDelayedFunc = true; + } + // If the policy determines that we should inline this function, // try to do so. if (!shouldInline(CS)) { + if (Callee->hasFnAttribute(Attribute::DelayedInline)) { + // We need to add delayed call sites back to make sure they + // are inlined correctly. + // FIXME: Note in the A->B->C->D case, + // we are visiting callee B of A. We insert C call sites + // before B. This will cause inline B<-C first, while + // the bottom up mode inlines C<-D and then B-<-C. + DelayedCount += insertFunctionCallSites(Callee, CallSites, CSi); + + // And re-test after all delayed call sites are inlined. + Callee->removeFnAttr(Attribute::DelayedInline); + ++DelayedCount; + --CSi; + continue; + } emitOptimizationRemarkMissed(CallerCtx, DEBUG_TYPE, *Caller, DLoc, Twine(Callee->getName() + " will not be inlined into " + Caller->getName())); + if (InDelayedMakeupProcess) { + CallSites.erase(CallSites.begin() + CSi); + --CSi; + } continue; } @@ -553,6 +707,20 @@ } ++NumInlined; + if (Callee->hasFnAttribute(Attribute::DelayedInline) && + !toBeVisitedAgain(Callee) && + !(Callee->use_empty() && Callee->hasLocalLinkage() && + !SCCFunctions.count(Callee) && + CG[Callee]->getNumReferences() == 0)) { + // B was inlined into A above. If B is marked as delayed and it is + // an extern function and it is the last time visiting B, so we need + // to make sure B has C inlined. + DelayedCount += insertFunctionCallSites(Callee, CallSites, CSi + 1); + // This callee will be inlined. It is safe to + // remove delayedinline flag now. + Callee->removeFnAttr(Attribute::DelayedInline); + } + // Report the inline decision. emitOptimizationRemark( CallerCtx, DEBUG_TYPE, *Caller, DLoc, @@ -569,7 +737,20 @@ for (unsigned i = 0, e = InlineInfo.InlinedCalls.size(); i != e; ++i) { Value *Ptr = InlineInfo.InlinedCalls[i]; - CallSites.push_back(std::make_pair(CallSite(Ptr), NewHistoryID)); + + CallSite NewCS(Ptr); + if (InDelayedMakeupProcess) { + // Only consider inlined funcs if the callee is delayed function. + // We have to put them before the next candidate because they + // originally should be inlined already. + if (IsCalleeDelayedFunc && NewCS.getCalledFunction() && + !NewCS.getCalledFunction()->isDeclaration()) { + CallSites.insert(CallSites.begin() + CSi + 1, + std::make_pair(NewCS, NewHistoryID)); + ++DelayedCount; + } + } else + CallSites.push_back(std::make_pair(NewCS, NewHistoryID)); } } } @@ -600,7 +781,7 @@ // swap/pop_back for efficiency, but do not use it if doing so would // move a call site to a function in this SCC before the // 'FirstCallInSCC' barrier. - if (SCC.isSingular()) { + if (SCC.isSingular() && DelayedCount == 0) { CallSites[CSi] = CallSites.back(); CallSites.pop_back(); } else { @@ -634,6 +815,12 @@ if (!F || F->isDeclaration()) continue; + // Remove inline internal attributes + if (F->hasFnAttribute(Attribute::DelayedInline)) + F->removeFnAttr(Attribute::DelayedInline); + if (F->hasFnAttribute("CICost")) + F->removeFnAttr("CICost"); + // Handle the case when this function is called and we only want to care // about always-inline functions. This is a bit of a hack to share code // between here and the InlineAlways pass.