Index: lib/Analysis/InlineCost.cpp =================================================================== --- lib/Analysis/InlineCost.cpp +++ lib/Analysis/InlineCost.cpp @@ -126,8 +126,16 @@ /// Tunable parameters that control the analysis. const InlineParams &Params; + /// Upper bound for the inlining cost. Bonuses are being applied to account + /// for speculative "expected profit" of the inlining decision. int Threshold; - int Cost; + + /// Inlining cost measured in abstract units, accounts for all the + /// instructions expected to be executed for a given function invocation. + /// Instructions that are statically proven to be dead based on call-site + /// arguments are not counted here. + int Cost = 0; + bool ComputeFullInlineCost; bool IsCallerRecursive; @@ -143,9 +151,23 @@ /// Number of bytes allocated statically by the callee. uint64_t AllocatedSize; unsigned NumInstructions, NumVectorInstructions; - int VectorBonus, TenPercentVectorBonus; - // Bonus to be applied when the callee has only one reachable basic block. - int SingleBBBonus; + + /// Bonus to be applied when percentage of vector instructions in callee is + /// high (see more details in updateThreshold). + int VectorBonus = 0; + /// Bonus to be applied when the callee has only one reachable basic block. + int SingleBBBonus = 0; + + /// If there is only one call of the function, and it has internal linkage, + /// we can allow to inline substantially more. + int LastCallToStaticBonus = 0; + + /// When we are able to prove that inlining the callee will make inlining + /// of a particular nested call-site profitable, we want to give a certain + /// bonus with size of a callee deducted. + /// Tracking the actually applied bonus here, as well as adding it directly + /// to Threshold. + int InlinedCallBonus = 0; /// While we walk the potentially-inlined instructions, we build up and /// maintain a mapping of simplified values specific to this callsite. The @@ -270,6 +292,16 @@ bool visitCatchReturnInst(CatchReturnInst &RI); bool visitUnreachableInst(UnreachableInst &I); + void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) { + Cost = std::min(UpperBound, Cost + Inc); + } + + void applyInlinedCallBonus(int Bonus) { + assert(Bonus >= 0); + Threshold += Bonus; + InlinedCallBonus += Bonus; + } + public: CallAnalyzer(const TargetTransformInfo &TTI, std::function &GetAssumptionCache, @@ -279,18 +311,17 @@ : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI), PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE), CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold), - Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost || - Params.ComputeFullInlineCost || ORE), + ComputeFullInlineCost(OptComputeFullInlineCost || + Params.ComputeFullInlineCost || ORE), IsCallerRecursive(false), IsRecursiveCall(false), ExposesReturnsTwice(false), HasDynamicAlloca(false), ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false), HasUninlineableIntrinsic(false), InitsVargArgs(false), AllocatedSize(0), - NumInstructions(0), NumVectorInstructions(0), VectorBonus(0), - SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0), - NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), - NumConstantPtrCmps(0), NumConstantPtrDiffs(0), - NumInstructionsSimplified(0), SROACostSavings(0), - SROACostSavingsLost(0) {} + NumInstructions(0), NumVectorInstructions(0), + EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0), + NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0), + NumConstantPtrDiffs(0), NumInstructionsSimplified(0), + SROACostSavings(0), SROACostSavingsLost(0) {} InlineResult analyzeCall(CallSite CS); @@ -341,7 +372,7 @@ void CallAnalyzer::disableSROA(DenseMap::iterator CostIt) { // If we're no longer able to perform SROA we need to undo its cost savings // and prevent subsequent analysis. - Cost += CostIt->second; + addCost(CostIt->second); SROACostSavings -= CostIt->second; SROACostSavingsLost += CostIt->second; SROAArgCosts.erase(CostIt); @@ -365,7 +396,7 @@ void CallAnalyzer::disableLoadElimination() { if (EnableLoadElimination) { - Cost += LoadEliminationCost; + addCost(LoadEliminationCost); LoadEliminationCost = 0; EnableLoadElimination = false; } @@ -720,7 +751,7 @@ case Instruction::FPToUI: case Instruction::FPToSI: if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive) - Cost += InlineConstants::CallPenalty; + addCost(InlineConstants::CallPenalty); break; default: break; @@ -886,7 +917,15 @@ // and the callsite. int SingleBBBonusPercent = 50; int VectorBonusPercent = 150; - int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus; + + int LastCallToStaticBonus = 0; + bool OnlyOneCallAndLocalLinkage = + F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction(); + // If there is only one call of the function, and it has internal linkage, + // we can allow to inline pretty anything as it will lead to size reduction + // anyway. + if (OnlyOneCallAndLocalLinkage) + LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus; // Lambda to set all the above bonus and bonus percentages to 0. auto DisallowAllBonuses = [&]() { @@ -959,20 +998,13 @@ } } - // Finally, take the target-specific inlining threshold multiplier into - // account. + // Take the target-specific inlining threshold multiplier into account. Threshold *= TTI.getInliningThresholdMultiplier(); SingleBBBonus = Threshold * SingleBBBonusPercent / 100; VectorBonus = Threshold * VectorBonusPercent / 100; - bool OnlyOneCallAndLocalLinkage = - F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction(); - // If there is only one call of the function, and it has internal linkage, - // the cost of inlining it drops dramatically. It may seem odd to update - // Cost in updateThreshold, but the bonus depends on the logic in this method. - if (OnlyOneCallAndLocalLinkage) - Cost -= LastCallToStaticBonus; + Threshold += LastCallToStaticBonus; } bool CallAnalyzer::visitCmpInst(CmpInst &I) { @@ -1089,7 +1121,7 @@ // as such. if (I.getType()->isFloatingPointTy() && TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive) - Cost += InlineConstants::CallPenalty; + addCost(InlineConstants::CallPenalty); return false; } @@ -1228,7 +1260,7 @@ case Intrinsic::load_relative: // This is normally lowered to 4 LLVM instructions. - Cost += 3 * InlineConstants::InstrCost; + addCost(3 * InlineConstants::InstrCost); return false; case Intrinsic::memset: @@ -1257,12 +1289,12 @@ if (TTI.isLoweredToCall(F)) { // We account for the average 1 instruction per call argument setup // here. - Cost += CS.arg_size() * InlineConstants::InstrCost; + addCost(CS.arg_size() * InlineConstants::InstrCost); // Everything other than inline ASM will also have a significant cost // merely from making the call. if (!isa(CS.getCalledValue())) - Cost += InlineConstants::CallPenalty; + addCost(InlineConstants::CallPenalty); } if (!CS.onlyReadsMemory()) @@ -1276,7 +1308,7 @@ // First, pay the price of the argument setup. We account for the average // 1 instruction per call argument setup here. - Cost += CS.arg_size() * InlineConstants::InstrCost; + addCost(CS.arg_size() * InlineConstants::InstrCost); // Next, check if this happens to be an indirect function call to a known // function in this inline context. If not, we've done all we can. @@ -1297,9 +1329,10 @@ CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS, IndirectCallParams); if (CA.analyzeCall(CS)) { - // We were able to inline the indirect call! Subtract the cost from the - // threshold to get the bonus we want to apply, but don't go below zero. - Cost -= std::max(0, CA.getThreshold() - CA.getCost()); + // We were able to inline the indirect call! Increase the threshold + // with the bonus we want to apply (less the cost of inlinee). + // Make sure the bonus doesn't go below zero. + applyInlinedCallBonus(std::max(0, CA.getThreshold() - CA.getCost())); } if (!F->onlyReadsMemory()) @@ -1438,7 +1471,7 @@ (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost); if (CostLowerBound > Threshold && !ComputeFullInlineCost) { - Cost = CostLowerBound; + addCost((int64_t)SI.getNumCases() * InlineConstants::InstrCost); return false; } @@ -1452,7 +1485,7 @@ int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost + 4 * InlineConstants::InstrCost; - Cost = std::min((int64_t)CostUpperBound, JTCost + Cost); + addCost(JTCost, (int64_t)CostUpperBound); return false; } @@ -1473,7 +1506,7 @@ // n + n / 2 - 1 = n * 3 / 2 - 1 if (NumCaseCluster <= 3) { // Suppose a comparison includes one compare and one conditional branch. - Cost += NumCaseCluster * 2 * InlineConstants::InstrCost; + addCost(NumCaseCluster * 2 * InlineConstants::InstrCost); return false; } @@ -1481,7 +1514,7 @@ int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost; - Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost); + addCost(SwitchCost, (int64_t)CostUpperBound); return false; } @@ -1574,7 +1607,7 @@ if (Base::visit(&*I)) ++NumInstructionsSimplified; else - Cost += InlineConstants::InstrCost; + addCost(InlineConstants::InstrCost); using namespace ore; // If the visit this instruction detected an uninlinable pattern, abort. @@ -1619,7 +1652,7 @@ return IR; } - // Check if we've past the maximum possible threshold so we don't spin in + // Check if we've passed the maximum possible threshold so we don't spin in // huge basic blocks that will never inline. if (Cost >= Threshold && !ComputeFullInlineCost) return false; @@ -1745,7 +1778,7 @@ // Give out bonuses for the callsite, as the instructions setting them up // will be gone after inlining. - Cost -= getCallsiteCost(CS, DL); + addCost(-getCallsiteCost(CS, DL)); // If this function uses the coldcc calling convention, prefer not to inline // it. @@ -1909,7 +1942,7 @@ continue; NumLoops++; } - Cost += NumLoops * InlineConstants::CallPenalty; + addCost(NumLoops * InlineConstants::CallPenalty); } // We applied the maximum possible vector bonus at the beginning. Now, Index: test/LTO/Resolution/X86/diagnostic-handler-remarks-with-hotness.ll =================================================================== --- test/LTO/Resolution/X86/diagnostic-handler-remarks-with-hotness.ll +++ test/LTO/Resolution/X86/diagnostic-handler-remarks-with-hotness.ll @@ -27,13 +27,13 @@ ; YAML-NEXT: - Caller: main ; YAML-NEXT: - String: ' with ' ; YAML-NEXT: - String: '(cost=' -; YAML-NEXT: - Cost: '-15000' +; YAML-NEXT: - Cost: '0' ; YAML-NEXT: - String: ', threshold=' -; YAML-NEXT: - Threshold: '337' +; YAML-NEXT: - Threshold: '15337' ; YAML-NEXT: - String: ')' ; YAML-NEXT: ... -; CHECK: tinkywinky inlined into main with (cost=-15000, threshold=337) (hotness: 300) +; CHECK: tinkywinky inlined into main with (cost=0, threshold=15337) (hotness: 300) target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-scei-ps4" Index: test/LTO/Resolution/X86/diagnostic-handler-remarks.ll =================================================================== --- test/LTO/Resolution/X86/diagnostic-handler-remarks.ll +++ test/LTO/Resolution/X86/diagnostic-handler-remarks.ll @@ -30,9 +30,9 @@ ; YAML-NEXT: - Caller: main ; YAML-NEXT: - String: ' with ' ; YAML-NEXT: - String: '(cost=' -; YAML-NEXT: - Cost: '-15000' +; YAML-NEXT: - Cost: '0' ; YAML-NEXT: - String: ', threshold=' -; YAML-NEXT: - Threshold: '337' +; YAML-NEXT: - Threshold: '15337' ; YAML-NEXT: - String: ')' ; YAML-NEXT: ... Index: test/LTO/X86/diagnostic-handler-remarks-with-hotness.ll =================================================================== --- test/LTO/X86/diagnostic-handler-remarks-with-hotness.ll +++ test/LTO/X86/diagnostic-handler-remarks-with-hotness.ll @@ -19,9 +19,9 @@ ; YAML-NEXT: - Caller: main ; YAML-NEXT: - String: ' with ' ; YAML-NEXT: - String: '(cost=' -; YAML-NEXT: - Cost: '-15000' +; YAML-NEXT: - Cost: '0' ; YAML-NEXT: - String: ', threshold=' -; YAML-NEXT: - Threshold: '337' +; YAML-NEXT: - Threshold: '15337' ; YAML-NEXT: - String: ')' ; YAML-NEXT: ... Index: test/LTO/X86/diagnostic-handler-remarks.ll =================================================================== --- test/LTO/X86/diagnostic-handler-remarks.ll +++ test/LTO/X86/diagnostic-handler-remarks.ll @@ -55,9 +55,9 @@ ; YAML-NEXT: - Caller: main ; YAML-NEXT: - String: ' with ' ; YAML-NEXT: - String: '(cost=' -; YAML-NEXT: - Cost: '-15000' +; YAML-NEXT: - Cost: '0' ; YAML-NEXT: - String: ', threshold=' -; YAML-NEXT: - Threshold: '337' +; YAML-NEXT: - Threshold: '15337' ; YAML-NEXT: - String: ')' ; YAML-NEXT: ... Index: test/Transforms/Inline/ARM/inline-fp.ll =================================================================== --- test/Transforms/Inline/ARM/inline-fp.ll +++ test/Transforms/Inline/ARM/inline-fp.ll @@ -7,25 +7,25 @@ ; NOFP-DAG: single not inlined into test_single because too costly to inline (cost=125, threshold=75) ; NOFP-DAG: single not inlined into test_single because too costly to inline (cost=125, threshold=75) ; NOFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15, threshold=75) -; NOFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15015, threshold=75) +; NOFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15, threshold=15075) ; NOFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75) ; NOFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75) ; NOFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75) ; NOFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75) ; FULLFP-DAG: single inlined into test_single with (cost=0, threshold=75) -; FULLFP-DAG: single inlined into test_single with (cost=-15000, threshold=75) +; FULLFP-DAG: single inlined into test_single with (cost=0, threshold=15075) ; FULLFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15, threshold=75) -; FULLFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15015, threshold=75) +; FULLFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15, threshold=15075) ; FULLFP-DAG: double inlined into test_double with (cost=0, threshold=75) -; FULLFP-DAG: double inlined into test_double with (cost=-15000, threshold=75) +; FULLFP-DAG: double inlined into test_double with (cost=0, threshold=15075) ; FULLFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75) ; FULLFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75) ; SINGLEFP-DAG: single inlined into test_single with (cost=0, threshold=75) -; SINGLEFP-DAG: single inlined into test_single with (cost=-15000, threshold=75) +; SINGLEFP-DAG: single inlined into test_single with (cost=0, threshold=15075) ; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15, threshold=75) -; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15015, threshold=75) +; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15, threshold=15075) ; SINGLEFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75) ; SINGLEFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75) ; SINGLEFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)