Index: lib/Analysis/InlineCost.cpp
===================================================================
--- lib/Analysis/InlineCost.cpp
+++ lib/Analysis/InlineCost.cpp
@@ -126,8 +126,16 @@
   /// Tunable parameters that control the analysis.
   const InlineParams &Params;
 
+  /// Upper bound for the inlining cost. Bonuses are being applied to account
+  /// for speculative "expected profit" of the inlining decision.
   int Threshold;
-  int Cost;
+
+  /// Inlining cost measured in abstract units, accounts for all the
+  /// instructions expected to be executed for a given function invocation.
+  /// Instructions that are statically proven to be dead based on call-site
+  /// arguments are not counted here.
+  int Cost = 0;
+
   bool ComputeFullInlineCost;
 
   bool IsCallerRecursive;
@@ -143,9 +151,23 @@
   /// Number of bytes allocated statically by the callee.
   uint64_t AllocatedSize;
   unsigned NumInstructions, NumVectorInstructions;
-  int VectorBonus, TenPercentVectorBonus;
-  // Bonus to be applied when the callee has only one reachable basic block.
-  int SingleBBBonus;
+
+  /// Bonus to be applied when percentage of vector instructions in callee is
+  /// high (see more details in updateThreshold).
+  int VectorBonus = 0;
+  /// Bonus to be applied when the callee has only one reachable basic block.
+  int SingleBBBonus = 0;
+
+  /// If there is only one call of the function, and it has internal linkage,
+  /// we can allow to inline substantially more.
+  int LastCallToStaticBonus = 0;
+
+  /// When we are able to prove that inlining the callee will make inlining
+  /// of a particular nested call-site profitable, we want to give a certain
+  /// bonus with size of a callee deducted.
+  /// Tracking the actually applied bonus here, as well as adding it directly
+  /// to Threshold.
+  int InlinedCallBonus = 0;
 
   /// While we walk the potentially-inlined instructions, we build up and
   /// maintain a mapping of simplified values specific to this callsite. The
@@ -270,6 +292,16 @@
   bool visitCatchReturnInst(CatchReturnInst &RI);
   bool visitUnreachableInst(UnreachableInst &I);
 
+  void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
+    Cost = std::min(UpperBound, Cost + Inc);
+  }
+
+  void applyInlinedCallBonus(int Bonus) {
+    assert(Bonus >= 0);
+    Threshold += Bonus;
+    InlinedCallBonus += Bonus;
+  }
+
 public:
   CallAnalyzer(const TargetTransformInfo &TTI,
                std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
@@ -279,18 +311,17 @@
       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
         CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
-        Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
-                                       Params.ComputeFullInlineCost || ORE),
+        ComputeFullInlineCost(OptComputeFullInlineCost ||
+                              Params.ComputeFullInlineCost || ORE),
         IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
         HasUninlineableIntrinsic(false), InitsVargArgs(false), AllocatedSize(0),
-        NumInstructions(0), NumVectorInstructions(0), VectorBonus(0),
-        SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0),
-        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
-        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
-        NumInstructionsSimplified(0), SROACostSavings(0),
-        SROACostSavingsLost(0) {}
+        NumInstructions(0), NumVectorInstructions(0),
+        EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0),
+        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
+        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
+        SROACostSavings(0), SROACostSavingsLost(0) {}
 
   InlineResult analyzeCall(CallSite CS);
 
@@ -341,7 +372,7 @@
 void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
   // If we're no longer able to perform SROA we need to undo its cost savings
   // and prevent subsequent analysis.
-  Cost += CostIt->second;
+  addCost(CostIt->second);
   SROACostSavings -= CostIt->second;
   SROACostSavingsLost += CostIt->second;
   SROAArgCosts.erase(CostIt);
@@ -365,7 +396,7 @@
 
 void CallAnalyzer::disableLoadElimination() {
   if (EnableLoadElimination) {
-    Cost += LoadEliminationCost;
+    addCost(LoadEliminationCost);
     LoadEliminationCost = 0;
     EnableLoadElimination = false;
   }
@@ -720,7 +751,7 @@
   case Instruction::FPToUI:
   case Instruction::FPToSI:
     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
-      Cost += InlineConstants::CallPenalty;
+      addCost(InlineConstants::CallPenalty);
     break;
   default:
     break;
@@ -886,7 +917,15 @@
   // and the callsite.
   int SingleBBBonusPercent = 50;
   int VectorBonusPercent = 150;
-  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
+
+  int LastCallToStaticBonus = 0;
+  bool OnlyOneCallAndLocalLinkage =
+      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
+  // If there is only one call of the function, and it has internal linkage,
+  // we can allow to inline pretty anything as it will lead to size reduction
+  // anyway.
+  if (OnlyOneCallAndLocalLinkage)
+    LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
 
   // Lambda to set all the above bonus and bonus percentages to 0.
   auto DisallowAllBonuses = [&]() {
@@ -959,20 +998,13 @@
     }
   }
 
-  // Finally, take the target-specific inlining threshold multiplier into
-  // account.
+  // Take the target-specific inlining threshold multiplier into account.
   Threshold *= TTI.getInliningThresholdMultiplier();
 
   SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
   VectorBonus = Threshold * VectorBonusPercent / 100;
 
-  bool OnlyOneCallAndLocalLinkage =
-      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
-  // If there is only one call of the function, and it has internal linkage,
-  // the cost of inlining it drops dramatically. It may seem odd to update
-  // Cost in updateThreshold, but the bonus depends on the logic in this method.
-  if (OnlyOneCallAndLocalLinkage)
-    Cost -= LastCallToStaticBonus;
+  Threshold += LastCallToStaticBonus;
 }
 
 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
@@ -1089,7 +1121,7 @@
   // as such.
   if (I.getType()->isFloatingPointTy() &&
       TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
-    Cost += InlineConstants::CallPenalty;
+    addCost(InlineConstants::CallPenalty);
 
   return false;
 }
@@ -1228,7 +1260,7 @@
 
       case Intrinsic::load_relative:
         // This is normally lowered to 4 LLVM instructions.
-        Cost += 3 * InlineConstants::InstrCost;
+        addCost(3 * InlineConstants::InstrCost);
         return false;
 
       case Intrinsic::memset:
@@ -1257,12 +1289,12 @@
     if (TTI.isLoweredToCall(F)) {
       // We account for the average 1 instruction per call argument setup
       // here.
-      Cost += CS.arg_size() * InlineConstants::InstrCost;
+      addCost(CS.arg_size() * InlineConstants::InstrCost);
 
       // Everything other than inline ASM will also have a significant cost
       // merely from making the call.
       if (!isa<InlineAsm>(CS.getCalledValue()))
-        Cost += InlineConstants::CallPenalty;
+        addCost(InlineConstants::CallPenalty);
     }
 
     if (!CS.onlyReadsMemory())
@@ -1276,7 +1308,7 @@
 
   // First, pay the price of the argument setup. We account for the average
   // 1 instruction per call argument setup here.
-  Cost += CS.arg_size() * InlineConstants::InstrCost;
+  addCost(CS.arg_size() * InlineConstants::InstrCost);
 
   // Next, check if this happens to be an indirect function call to a known
   // function in this inline context. If not, we've done all we can.
@@ -1297,9 +1329,10 @@
   CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
                   IndirectCallParams);
   if (CA.analyzeCall(CS)) {
-    // We were able to inline the indirect call! Subtract the cost from the
-    // threshold to get the bonus we want to apply, but don't go below zero.
-    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
+    // We were able to inline the indirect call! Increase the threshold
+    // with the bonus we want to apply (less the cost of inlinee).
+    // Make sure the bonus doesn't go below zero.
+    applyInlinedCallBonus(std::max(0, CA.getThreshold() - CA.getCost()));
   }
 
   if (!F->onlyReadsMemory())
@@ -1438,7 +1471,7 @@
                (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
 
   if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
-    Cost = CostLowerBound;
+    addCost((int64_t)SI.getNumCases() * InlineConstants::InstrCost);
     return false;
   }
 
@@ -1452,7 +1485,7 @@
     int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                      4 * InlineConstants::InstrCost;
 
-    Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
+    addCost(JTCost, (int64_t)CostUpperBound);
     return false;
   }
 
@@ -1473,7 +1506,7 @@
   //   n + n / 2 - 1 = n * 3 / 2 - 1
   if (NumCaseCluster <= 3) {
     // Suppose a comparison includes one compare and one conditional branch.
-    Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
+    addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
     return false;
   }
 
@@ -1481,7 +1514,7 @@
   int64_t SwitchCost =
       ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
 
-  Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
+  addCost(SwitchCost, (int64_t)CostUpperBound);
   return false;
 }
 
@@ -1574,7 +1607,7 @@
     if (Base::visit(&*I))
       ++NumInstructionsSimplified;
     else
-      Cost += InlineConstants::InstrCost;
+      addCost(InlineConstants::InstrCost);
 
     using namespace ore;
     // If the visit this instruction detected an uninlinable pattern, abort.
@@ -1619,7 +1652,7 @@
       return IR;
     }
 
-    // Check if we've past the maximum possible threshold so we don't spin in
+    // Check if we've passed the maximum possible threshold so we don't spin in
     // huge basic blocks that will never inline.
     if (Cost >= Threshold && !ComputeFullInlineCost)
       return false;
@@ -1745,7 +1778,7 @@
 
   // Give out bonuses for the callsite, as the instructions setting them up
   // will be gone after inlining.
-  Cost -= getCallsiteCost(CS, DL);
+  addCost(-getCallsiteCost(CS, DL));
 
   // If this function uses the coldcc calling convention, prefer not to inline
   // it.
@@ -1909,7 +1942,7 @@
         continue;
       NumLoops++;
     }
-    Cost += NumLoops * InlineConstants::CallPenalty;
+    addCost(NumLoops * InlineConstants::CallPenalty);
   }
 
   // We applied the maximum possible vector bonus at the beginning. Now,
Index: test/LTO/Resolution/X86/diagnostic-handler-remarks-with-hotness.ll
===================================================================
--- test/LTO/Resolution/X86/diagnostic-handler-remarks-with-hotness.ll
+++ test/LTO/Resolution/X86/diagnostic-handler-remarks-with-hotness.ll
@@ -27,13 +27,13 @@
 ; YAML-NEXT:   - Caller:          main
 ; YAML-NEXT:   - String:          ' with '
 ; YAML-NEXT:   - String:          '(cost='
-; YAML-NEXT:   - Cost:            '-15000'
+; YAML-NEXT:   - Cost:            '0'
 ; YAML-NEXT:   - String:          ', threshold='
-; YAML-NEXT:   - Threshold:       '337'
+; YAML-NEXT:   - Threshold:       '15337'
 ; YAML-NEXT:   - String:          ')'
 ; YAML-NEXT: ...
 
-; CHECK: tinkywinky inlined into main with (cost=-15000, threshold=337) (hotness: 300)
+; CHECK: tinkywinky inlined into main with (cost=0, threshold=15337) (hotness: 300)
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-scei-ps4"
Index: test/LTO/Resolution/X86/diagnostic-handler-remarks.ll
===================================================================
--- test/LTO/Resolution/X86/diagnostic-handler-remarks.ll
+++ test/LTO/Resolution/X86/diagnostic-handler-remarks.ll
@@ -30,9 +30,9 @@
 ; YAML-NEXT:   - Caller:          main
 ; YAML-NEXT:   - String:          ' with '
 ; YAML-NEXT:   - String:          '(cost='
-; YAML-NEXT:   - Cost:            '-15000'
+; YAML-NEXT:   - Cost:            '0'
 ; YAML-NEXT:   - String:          ', threshold='
-; YAML-NEXT:   - Threshold:       '337'
+; YAML-NEXT:   - Threshold:       '15337'
 ; YAML-NEXT:   - String:          ')'
 ; YAML-NEXT: ...
 
Index: test/LTO/X86/diagnostic-handler-remarks-with-hotness.ll
===================================================================
--- test/LTO/X86/diagnostic-handler-remarks-with-hotness.ll
+++ test/LTO/X86/diagnostic-handler-remarks-with-hotness.ll
@@ -19,9 +19,9 @@
 ; YAML-NEXT:   - Caller:          main
 ; YAML-NEXT:   - String:          ' with '
 ; YAML-NEXT:   - String:          '(cost='
-; YAML-NEXT:   - Cost:            '-15000'
+; YAML-NEXT:   - Cost:            '0'
 ; YAML-NEXT:   - String:          ', threshold='
-; YAML-NEXT:   - Threshold:       '337'
+; YAML-NEXT:   - Threshold:       '15337'
 ; YAML-NEXT:   - String:          ')'
 ; YAML-NEXT: ...
 
Index: test/LTO/X86/diagnostic-handler-remarks.ll
===================================================================
--- test/LTO/X86/diagnostic-handler-remarks.ll
+++ test/LTO/X86/diagnostic-handler-remarks.ll
@@ -55,9 +55,9 @@
 ; YAML-NEXT:   - Caller:          main
 ; YAML-NEXT:   - String:          ' with '
 ; YAML-NEXT:   - String:          '(cost='
-; YAML-NEXT:   - Cost:            '-15000'
+; YAML-NEXT:   - Cost:            '0'
 ; YAML-NEXT:   - String:          ', threshold='
-; YAML-NEXT:   - Threshold:       '337'
+; YAML-NEXT:   - Threshold:       '15337'
 ; YAML-NEXT:   - String:          ')'
 ; YAML-NEXT: ...
 
Index: test/Transforms/Inline/ARM/inline-fp.ll
===================================================================
--- test/Transforms/Inline/ARM/inline-fp.ll
+++ test/Transforms/Inline/ARM/inline-fp.ll
@@ -7,25 +7,25 @@
 ; NOFP-DAG: single not inlined into test_single because too costly to inline (cost=125, threshold=75)
 ; NOFP-DAG: single not inlined into test_single because too costly to inline (cost=125, threshold=75)
 ; NOFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15, threshold=75)
-; NOFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15015, threshold=75)
+; NOFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15, threshold=15075)
 ; NOFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
 ; NOFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
 ; NOFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
 ; NOFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
 
 ; FULLFP-DAG: single inlined into test_single with (cost=0, threshold=75)
-; FULLFP-DAG: single inlined into test_single with (cost=-15000, threshold=75)
+; FULLFP-DAG: single inlined into test_single with (cost=0, threshold=15075)
 ; FULLFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15, threshold=75)
-; FULLFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15015, threshold=75)
+; FULLFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15, threshold=15075)
 ; FULLFP-DAG: double inlined into test_double with (cost=0, threshold=75)
-; FULLFP-DAG: double inlined into test_double with (cost=-15000, threshold=75)
+; FULLFP-DAG: double inlined into test_double with (cost=0, threshold=15075)
 ; FULLFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
 ; FULLFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)
 
 ; SINGLEFP-DAG: single inlined into test_single with (cost=0, threshold=75)
-; SINGLEFP-DAG: single inlined into test_single with (cost=-15000, threshold=75)
+; SINGLEFP-DAG: single inlined into test_single with (cost=0, threshold=15075)
 ; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15, threshold=75)
-; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15015, threshold=75)
+; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with (cost=-15, threshold=15075)
 ; SINGLEFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
 ; SINGLEFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75)
 ; SINGLEFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75)