Index: include/llvm/Transforms/Utils/CodeExtractor.h =================================================================== --- include/llvm/Transforms/Utils/CodeExtractor.h +++ include/llvm/Transforms/Utils/CodeExtractor.h @@ -83,6 +83,11 @@ BlockFrequencyInfo *BFI = nullptr, BranchProbabilityInfo *BPI = nullptr); + /// Return the costs of outlining the extracted region. + /// The first member of the returned tuple is the estimate size + /// cost and the second is runtime cost. + std::tuple computeOutliningCost(); + /// \brief Perform the extraction, returning the new function. /// /// Returns zero when called on a CodeExtractor instance where isEligible Index: lib/Transforms/IPO/PartialInlining.cpp =================================================================== --- lib/Transforms/IPO/PartialInlining.cpp +++ lib/Transforms/IPO/PartialInlining.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationDiagnosticInfo.h" @@ -42,6 +43,10 @@ static cl::opt DisablePartialInlining("disable-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable partial ininling")); +// This is an option used by testing: +static cl::opt + SkipCostAnalysis("skip-partial-inlining-cost-analysis", cl::init(false), + cl::Hidden, cl::desc("Skip Cost Analysis")); static cl::opt MaxNumInlineBlocks( "max-num-inline-blocks", cl::init(5), cl::Hidden, @@ -84,8 +89,6 @@ bool run(Module &M); Function *unswitchFunction(Function *F); - std::unique_ptr computeOutliningInfo(Function *F); - private: int NumPartialInlining = 0; std::function *GetAssumptionCache; @@ -98,6 +101,41 @@ return (MaxNumPartialInlining != -1 && NumPartialInlining >= MaxNumPartialInlining); } + CallSite getCallSite(User *U) { + CallSite CS; + if (CallInst *CI = dyn_cast(U)) + CS = 
CallSite(CI); + else if (InvokeInst *II = dyn_cast(U)) + CS = CallSite(II); + else + llvm_unreachable("All uses must be calls"); + return CS; + } + CallSite getOneCallSiteTo(Function *F) { + User *User = *F->user_begin(); + return getCallSite(User); + } + std::tuple getOneDebugLoc(Function *F) { + CallSite CS = getOneCallSiteTo(F); + + DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); + BasicBlock *Block = CS.getParent(); + return std::make_tuple(DLoc, Block); + } + + // Returns true if the benefit of eliminating the call outweighs the + // additional runtime cost associated with the new call to the outlined + // function (extracted). + bool isPartialInliningBeneficial(Function *F, FunctionOutliningInfo *OI); + // Returns the cost associated with function outlining: + // - The first value is the code size estimate of the new call sequence to + // the outlined function; + // - The second value is the non-weighted runtime cost of that call sequence. + // - The third value is the size estimate of the original code that is + // extracted into the outlined function. 
+ std::tuple + computeOutliningCost(Function *F, const FunctionOutliningInfo *OutliningInfo); + std::unique_ptr computeOutliningInfo(Function *F); }; struct PartialInlinerLegacyPass : public ModulePass { @@ -293,6 +331,9 @@ OptimizationRemarkEmitter &ORE) { // TODO : more sharing with shouldInline in Inliner.cpp using namespace ore; + if (SkipCostAnalysis) + return true; + Instruction *Call = CS.getInstruction(); Function *Callee = CS.getCalledFunction(); Function *Caller = CS.getCaller(); @@ -316,7 +357,7 @@ } if (!IC) { - ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call) + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) << NV("Callee", Callee) << " not partially inlined into " << NV("Caller", Caller) << " because too costly to inline (cost=" << NV("Cost", IC.getCost()) << ", threshold=" @@ -332,6 +373,102 @@ return true; } +std::tuple +PartialInlinerImpl::computeOutliningCost(Function *F, + const FunctionOutliningInfo *OI) { + std::vector OutlinedRegion; + + SmallPtrSet EphValues; + CodeMetrics::collectEphemeralValues(F, &(*GetAssumptionCache)(*F), EphValues); + + int OutlinedRegionSize = 0; + for (BasicBlock &BB : *F) { + if (&BB != OI->ReturnBlock && + // Assuming Entry set is small -- do a linear search here. 
+ std::find(OI->Entries.begin(), OI->Entries.end(), &BB) == + OI->Entries.end()) { + OutlinedRegion.push_back(&BB); + for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) { + if (isa(I)) + continue; + if (EphValues.count(&*I)) + continue; + OutlinedRegionSize += InlineConstants::InstrCost; + } + } + } + + int OutliningSizeCost, OutliningRuntimeCost; + std::tie(OutliningSizeCost, OutliningRuntimeCost) = + CodeExtractor(OutlinedRegion, nullptr, false, nullptr, nullptr) + .computeOutliningCost(); + return std::make_tuple(OutliningSizeCost, OutliningRuntimeCost, + OutlinedRegionSize); +} + +bool PartialInlinerImpl::isPartialInliningBeneficial( + Function *F, FunctionOutliningInfo *OI) { + int NonWeightedRcost; + int SizeCost; + int OutlinedRegionSize; + + if (SkipCostAnalysis) + return true; + + std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSize) = + computeOutliningCost(F, OI); + + OptimizationRemarkEmitter ORE(F); + using namespace ore; + DebugLoc DLoc; + BasicBlock *Block; + std::tie(DLoc, Block) = getOneDebugLoc(F); + + // If the call sequence to the outlined function is larger than the original + // outlined region size, outlining does not increase the chances of inlining + // 'F' (the inliner uses the size increase to model the + // cost of inlining a callee). 
+ if (OutlinedRegionSize < SizeCost) { + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", + DLoc, Block) + << NV("Function", F) + << " not partially inlined into callers (Original Size = " + << NV("OutlinedRegionOriginalSize", OutlinedRegionSize) + << ", Size of call sequence to outlined function = " + << NV("NewSize", SizeCost) << ")"); + return false; + } + + // Now compare with weighted runtime cost: + LoopInfo LI{DominatorTree(*F)}; + BranchProbabilityInfo BPI(*F, LI); + BlockFrequencyInfo BFI(*F, BPI, LI); + + BlockFrequency EntryFreq = BFI.getBlockFreq(&F->getEntryBlock()); + BlockFrequency OutlinedCallFreq = BFI.getBlockFreq(OI->NonReturnBlock); + CallSite CS = getOneCallSiteTo(F); + const DataLayout &DL = F->getParent()->getDataLayout(); + // The savings of eliminating the call: + int NonWeightedSavings = getCallsiteCost(CS, DL); + int Denom = (NonWeightedSavings > NonWeightedRcost ? NonWeightedSavings + : NonWeightedRcost); + BranchProbability NormWeightedSavings(NonWeightedSavings, Denom); + BranchProbability NormWeightedRcost(NonWeightedRcost, Denom); + // Weighted saving is smaller than weighted cost, return false + if (EntryFreq * NormWeightedSavings < OutlinedCallFreq * NormWeightedRcost) { + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", + DLoc, Block) + << NV("Function", F) + << " not partially inlined into callers, runtime overhead " + << " of making outlined call is too high"); + + return false; + } + + // Ok, we can partial inline this function. 
+ return true; +} + Function *PartialInlinerImpl::unswitchFunction(Function *F) { if (F->hasAddressTaken()) @@ -347,12 +484,18 @@ if (PSI->isFunctionEntryCold(F)) return nullptr; + if (F->user_begin() == F->user_end()) + return nullptr; + std::unique_ptr OutliningInfo = computeOutliningInfo(F); if (!OutliningInfo) return nullptr; + if (!isPartialInliningBeneficial(F, OutliningInfo.get())) + return nullptr; + // Clone the function, so that we can hack away on it. ValueToValueMapTy VMap; Function *DuplicateFunction = CloneFunction(F, VMap); @@ -452,13 +595,7 @@ DuplicateFunction->user_end()); for (User *User : Users) { - CallSite CS; - if (CallInst *CI = dyn_cast(User)) - CS = CallSite(CI); - else if (InvokeInst *II = dyn_cast(User)) - CS = CallSite(II); - else - llvm_unreachable("All uses must be calls"); + CallSite CS = getCallSite(User); if (IsLimitReached()) continue; Index: lib/Transforms/Utils/CodeExtractor.cpp =================================================================== --- lib/Transforms/Utils/CodeExtractor.cpp +++ lib/Transforms/Utils/CodeExtractor.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" #include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" @@ -655,6 +656,57 @@ } } +// The cost of outlining the region consists of the following parts: +// 0. The new call/return instruction +// 1. Input parameter passing +// 2. Output parameter stack allocation in caller +// 3. Output parameter pointer passing +// 4. Output parameter stores (at the exit of the outlined function) +// 5. Output parameter loads (right after the call to the outlined function) +// 6. 
For multiple-exit regions, the branch cost (conditional branch) incurred +// after the call to the outlined function +// The first returned value is the size cost and the second is the estimated +// runtime cost. The runtime cost models the increased runtime overhead +// due to the newly created instructions, while the size cost models the +// size impact of the call sequence to the outlined function. +std::tuple CodeExtractor::computeOutliningCost() { + ValueSet inputs, outputs; + SmallPtrSet ExitBlocks; + + if (!isEligible()) + return std::make_tuple(0, 0); + + int RCost = 0, SCost = 0; + + for (BasicBlock *Block : Blocks) { + if (dyn_cast(Block->getTerminator())) { + NumExitBlocks++; + continue; + } + for (succ_iterator SI = succ_begin(Block), SE = succ_end(Block); SI != SE; + ++SI) { + if (!Blocks.count(*SI)) + NumExitBlocks++; + } + } + + // Find inputs to, outputs from the code region. + findInputsOutputs(inputs, outputs); + // TODO: check parameter attribute, callingConv etc. + RCost = (inputs.size() + outputs.size()) * InlineConstants::InstrCost; + // For each output parameter, there is cost associated with stack + // usage, before return stores, and after call re-loads: + RCost += 3 * outputs.size() * InlineConstants::InstrCost; + // Branch cost: + RCost += NumExitBlocks; + + SCost = RCost; + // Now the cost of calling the outlined function itself: + RCost += InlineConstants::CallPenalty; + SCost += InlineConstants::InstrCost; + return std::make_tuple(SCost, RCost); +} + void CodeExtractor::moveCodeToFunction(Function *newFunction) { Function *oldFunc = (*Blocks.begin())->getParent(); Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList(); Index: test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll =================================================================== --- test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll +++ test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -partial-inliner -S | FileCheck 
%s +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s ; This test checks to make sure that the CodeExtractor ; properly sets the entry count for the function that is Index: test/Transforms/CodeExtractor/MultipleExitBranchProb.ll =================================================================== --- test/Transforms/CodeExtractor/MultipleExitBranchProb.ll +++ test/Transforms/CodeExtractor/MultipleExitBranchProb.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s +; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis -S | FileCheck %s ; This test checks to make sure that CodeExtractor updates ; the exit branch probabilities for multiple exit blocks. Index: test/Transforms/CodeExtractor/PartialInlineHighCost.ll =================================================================== --- test/Transforms/CodeExtractor/PartialInlineHighCost.ll +++ test/Transforms/CodeExtractor/PartialInlineHighCost.ll @@ -0,0 +1,63 @@ +; The outlined region has high frequency and the outlining +; call sequence is expensive (input, output, multiple exit etc) +; RUN: opt < %s -partial-inliner -S | FileCheck %s +; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck --check-prefix=NOCOST %s +; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck --check-prefix=NOCOST %s + + +; Function Attrs: nounwind +define i32 @bar(i32 %arg) local_unnamed_addr #0 { +bb: + %tmp = icmp slt i32 %arg, 0 + br i1 %tmp, label %bb1, label %bb16, !prof !1 + +bb1: ; preds = %bb + %tmp2 = tail call i32 (...) @foo() #0 + %tmp3 = tail call i32 (...) @foo() #0 + %tmp4 = tail call i32 (...) @foo() #0 + %tmp5 = tail call i32 (...) @foo() #0 + %tmp6 = tail call i32 (...) @foo() #0 + %tmp7 = tail call i32 (...) 
@foo() #0 + %tmp8 = add nsw i32 %arg, 1 + %tmp9 = tail call i32 @goo(i32 %tmp8) #0 + %tmp10 = tail call i32 (...) @foo() #0 + %tmp11 = icmp eq i32 %tmp10, 0 + br i1 %tmp11, label %bb12, label %bb16 + +bb12: ; preds = %bb1 + %tmp13 = tail call i32 (...) @foo() #0 + %tmp14 = icmp eq i32 %tmp13, 0 + %tmp15 = select i1 %tmp14, i32 0, i32 3 + br label %bb16 + +bb16: ; preds = %bb12, %bb1, %bb + %tmp17 = phi i32 [ 2, %bb1 ], [ %tmp15, %bb12 ], [ 0, %bb ] + ret i32 %tmp17 +} + +; Function Attrs: nounwind +declare i32 @foo(...) local_unnamed_addr #0 + +; Function Attrs: nounwind +declare i32 @goo(i32) local_unnamed_addr #0 + +; Function Attrs: nounwind +define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 { +bb: +; CHECK-LABEL: @dummy_caller +; CHECK-NOT: br i1 +; CHECK-NOT: call{{.*}}bar. +; NOCOST-LABEL: @dummy_caller +; NOCOST: br i1 +; NOCOST: call{{.*}}bar. + %tmp = tail call i32 @bar(i32 %arg) + ret i32 %tmp +} + +attributes #0 = { nounwind } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 5.0.0 (trunk 301898)"} +!1 = !{!"branch_weights", i32 2000, i32 1} Index: test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll =================================================================== --- test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll +++ test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -partial-inliner | llc -filetype=null -; RUN: opt < %s -partial-inliner -S | FileCheck %s +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis | llc -filetype=null +; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s ; This testcase checks to see if CodeExtractor properly inherits ; target specific attributes for the extracted function. This can ; cause certain instructions that depend on the attributes to not