Index: lib/Analysis/InlineCost.cpp
===================================================================
--- lib/Analysis/InlineCost.cpp
+++ lib/Analysis/InlineCost.cpp
@@ -66,6 +66,30 @@
                          cl::ZeroOrMore,
                          cl::desc("Threshold for hot callsites "));
 
+// The following options control the heuristic that estimates speedup due to
+// inlining and adds a threshold bonus when the speedup is above a certain
+// limit.
+//
+/// Threshold bonus to apply when inlining is expected to result in a speedup.
+/// The bonus is expressed as a percent value. The unbonused threshold is
+/// multiplied by this percentage to arrive at the threshold increase.
+static cl::opt<int>
+    SpeedupBonusPercent("speedup-bonus-percent", cl::Hidden, cl::init(200),
+                        cl::ZeroOrMore,
+                        cl::desc("Bonus for callees showing speedup"));
+/// Minimum estimated speedup required to apply a threshold bonus.
+static cl::opt<int>
+    MinSpeedupForBonus("min-speedup-for-bonus", cl::Hidden, cl::init(10),
+                       cl::ZeroOrMore,
+                       cl::desc("Speedup percentage to apply bonus"));
+
+/// Minimum block frequency of callsite to apply speedup-based bonus.
+/// This limits size increase due to speedup bonus by applying the heuristic
+/// only when the callsite is relatively hot compared to caller's entry.
+static cl::opt<unsigned> MinBFForSpeedupBonus(
+    "min-freq-for-speedup-bonus", cl::Hidden, cl::init(8), cl::ZeroOrMore,
+    cl::desc("Min relative callsite freq to apply speedup bonus"));
+
 namespace {
 
 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
@@ -95,6 +119,7 @@
   /// Tunable parameters that control the analysis.
   const InlineParams &Params;
 
+  BlockFrequencyInfo *CallerBFI, *CalleeBFI;
   int Threshold;
   int Cost;
 
@@ -126,10 +151,13 @@
   /// allocas on the caller stack which could be simplified through SROA.
   DenseMap<Value *, Value *> SROAArgValues;
 
+  // Unweighted and block-frequency-weighted SROA cost savings, in that order.
+  using SROACostTy = std::pair<int, uint64_t>;
+
   /// The mapping of caller Alloca values to their accumulated cost savings. If
   /// we have to disable SROA for one of the allocas, this tells us how much
   /// cost must be added.
-  DenseMap<Value *, int> SROAArgCosts;
+  DenseMap<Value *, SROACostTy> SROAArgCosts;
 
   /// Keep track of values which map to a pointer base and constant offset.
   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
@@ -137,11 +165,13 @@
   // Custom simplification helper routines.
   bool isAllocaDerivedArg(Value *V);
   bool lookupSROAArgAndCost(Value *V, Value *&Arg,
-                            DenseMap<Value *, int>::iterator &CostIt);
-  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
+                            DenseMap<Value *, SROACostTy>::iterator &CostIt);
+  void disableSROA(DenseMap<Value *, SROACostTy>::iterator CostIt);
   void disableSROA(Value *V);
-  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
+  void accumulateSROACost(DenseMap<Value *, SROACostTy>::iterator CostIt,
                           int InstructionCost);
+  void accumulateCost(int InstructionCost);
+  void accumulateSavings(int Savings = InlineConstants::InstrCost);
   bool isGEPFree(GetElementPtrInst &GEP);
   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
   bool simplifyCallSite(Function *F, CallSite CS);
@@ -213,6 +243,8 @@
   bool
   simplifyInstruction(Instruction &I,
                       function_ref<Constant *(ArrayRef<Constant *>)> Evaluate);
+  int getSpeedupBonus(CallSite &CS, int Threshold);
+  bool hasEstimatedSpeedup();
 
 public:
   CallAnalyzer(const TargetTransformInfo &TTI,
@@ -230,8 +262,8 @@
         FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
         NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
         NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
-        NumInstructionsSimplified(0), SROACostSavings(0),
-        SROACostSavingsLost(0) {}
+        NumInstructionsSimplified(0), SROACostSavings(0), WeightedSavings(0),
+        WeightedCost(0), SROACostSavingsLost(0), CurrBBFreq(1) {}
 
   bool analyzeCall(CallSite CS);
 
@@ -247,7 +279,10 @@
   unsigned NumConstantPtrDiffs;
   unsigned NumInstructionsSimplified;
   unsigned SROACostSavings;
+  int WeightedSavings, WeightedCost;
   unsigned SROACostSavingsLost;
+  /// The block frequency of the current block being analyzed.
+  uint64_t CurrBBFreq;
 
   void dump();
 };
@@ -262,7 +297,7 @@
 /// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
 /// Returns false if V does not map to a SROA-candidate.
 bool CallAnalyzer::lookupSROAArgAndCost(
-    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
+    Value *V, Value *&Arg, DenseMap<Value *, SROACostTy>::iterator &CostIt) {
   if (SROAArgValues.empty() || SROAArgCosts.empty())
     return false;
 
@@ -279,28 +314,37 @@
 ///
 /// This marks the candidate as no longer viable for SROA, and adds the cost
 /// savings associated with it back into the inline cost measurement.
-void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
+void CallAnalyzer::disableSROA(DenseMap<Value *, SROACostTy>::iterator CostIt) {
   // If we're no longer able to perform SROA we need to undo its cost savings
   // and prevent subsequent analysis.
-  Cost += CostIt->second;
-  SROACostSavings -= CostIt->second;
-  SROACostSavingsLost += CostIt->second;
+  Cost += CostIt->second.first;
+  WeightedCost += CostIt->second.second;
+  SROACostSavings -= CostIt->second.first;
+  WeightedSavings -= CostIt->second.second;
+  SROACostSavingsLost += CostIt->second.first;
   SROAArgCosts.erase(CostIt);
 }
 
 /// \brief If 'V' maps to a SROA candidate, disable SROA for it.
 void CallAnalyzer::disableSROA(Value *V) {
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   if (lookupSROAArgAndCost(V, SROAArg, CostIt))
     disableSROA(CostIt);
 }
 
 /// \brief Accumulate the given cost for a particular SROA candidate.
-void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
-                                      int InstructionCost) {
-  CostIt->second += InstructionCost;
+void CallAnalyzer::accumulateSROACost(
+    DenseMap<Value *, SROACostTy>::iterator CostIt, int InstructionCost) {
+  CostIt->second.first += InstructionCost;
   SROACostSavings += InstructionCost;
+  // FIXME: We use saturating multiply on uint64_t here and below where we
+  // compute weighted cost/savings. If this proves to be less precise, consider
+  // using 128 bit APInt and also use relative block frequency (scale block
+  // frequencies relative to entry block).
+  auto Weighted = SaturatingMultiply(CurrBBFreq, (uint64_t)InstructionCost);
+  CostIt->second.second += Weighted;
+  WeightedSavings += Weighted;
 }
 
 /// \brief Accumulate a constant GEP offset into an APInt if possible.
@@ -398,9 +442,14 @@
   return true;
 }
 
+void CallAnalyzer::accumulateCost(int InstructionCost) {
+  Cost += InstructionCost; // NOTE(review): negative values wrap in the cast.
+  WeightedCost += SaturatingMultiply(CurrBBFreq, (uint64_t)InstructionCost);
+}
+
 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   bool SROACandidate =
       lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
 
@@ -417,6 +466,10 @@
         // Non-constant GEPs aren't folded, and disable SROA.
         if (SROACandidate)
           disableSROA(CostIt);
+        // isGEPFree looks up SimplifiedValues and so we should ideally be
+        // tracking savings here, but that would require calling the TTI hook
+        // with and without simplified values. So we take the conservative route
+        // and assume no savings due to the lookup.
         return isGEPFree(I);
       }
 
@@ -432,17 +485,27 @@
     }
   }
 
+  bool SVLookup = false;
+
   // Lambda to check whether a GEP's indices are all constant.
   auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
-    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
-      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
-        return false;
+    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E;
+         ++I) {
+      if (!isa<Constant>(*I)) {
+        if (SimplifiedValues.lookup(*I))
+          SVLookup = true;
+        else
+          return false;
+      }
+    }
     return true;
   };
 
   if (IsGEPOffsetConstant(I)) {
     if (SROACandidate)
       SROAArgValues[&I] = SROAArg;
+    if (SVLookup)
+      accumulateSavings();
 
     // Constant GEPs are modeled as free.
     return true;
@@ -454,25 +517,51 @@
   return isGEPFree(I);
 }
 
+void CallAnalyzer::accumulateSavings(int Savings) {
+  WeightedSavings += SaturatingMultiply(CurrBBFreq, (uint64_t)Savings);
+}
+
 bool CallAnalyzer::simplifyInstruction(
     Instruction &I, function_ref<Constant *(ArrayRef<Constant *>)> Evaluate) {
   Constant *COps[2];
   int N = 0;
+  bool Lookup = false;
   for (Value *Op : I.operands()) {
     Constant *COp = dyn_cast<Constant>(Op);
-    if (!COp)
+    if (!COp) {
       COp = SimplifiedValues.lookup(Op);
+      Lookup = true;
+    }
     if (!COp)
       return false;
     COps[N++] = COp;
   }
   if (auto *C = Evaluate(COps)) {
     SimplifiedValues[&I] = C;
+    if (Lookup)
+      accumulateSavings();
     return true;
   }
   return false;
 }
 
+int CallAnalyzer::getSpeedupBonus(CallSite &CS, int Threshold) {
+  const uint64_t EntryFreq = CallerBFI ? CallerBFI->getEntryFreq() : 0;
+  if (!CalleeBFI || EntryFreq == 0) // No BFI, or division below would trap.
+    return 0;
+  auto *BB = CS.getInstruction()->getParent();
+  auto CallSiteFreq = CallerBFI->getBlockFreq(BB).getFrequency();
+  if (CallSiteFreq / EntryFreq < MinBFForSpeedupBonus)
+    return 0;
+  return SpeedupBonusPercent * Threshold / 100;
+}
+
+bool CallAnalyzer::hasEstimatedSpeedup() {
+  const int64_t Savings = WeightedSavings; // Widened: int math can overflow.
+  const int64_t Total = std::max<int64_t>(1, Savings + WeightedCost);
+  return Savings * 100 / Total > MinSpeedupForBonus;
+}
+
 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
   // Propagate constants through bitcasts.
   auto Evaluate = [&](ArrayRef<Constant *> COps) -> Constant * {
@@ -490,7 +579,7 @@
 
   // Also look for SROA candidates here.
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
     SROAArgValues[&I] = SROAArg;
 
@@ -525,7 +614,7 @@
   // preserved either cannot fire on an integer, or won't in-and-of themselves
   // disable SROA (ext) w/o some later use that we would see and disable.
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
     SROAArgValues[&I] = SROAArg;
 
@@ -553,7 +642,7 @@
 
   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
     SROAArgValues[&I] = SROAArg;
 
@@ -674,7 +763,6 @@
     if (Callee.hasFnAttribute(Attribute::InlineHint))
       Threshold = MaxIfValid(Threshold, Params.HintThreshold);
     if (PSI) {
-      BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
       if (PSI->isHotCallSite(CS, CallerBFI)) {
         DEBUG(dbgs() << "Hot callsite.\n");
         Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold);
@@ -724,6 +812,7 @@
       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
       if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
+        accumulateSavings();
         SimplifiedValues[&I] = C;
         ++NumConstantPtrCmps;
         return true;
@@ -735,6 +824,8 @@
   // if we know the value (argument) can't be null
   if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
       isKnownNonNullInCallee(I.getOperand(0))) {
+    if (isAllocaDerivedArg(I.getOperand(0)))
+      accumulateSavings();
     bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
     SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                       : ConstantInt::getFalse(I.getType());
@@ -742,7 +833,7 @@
   }
   // Finally check for SROA candidates in comparisons.
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
     if (isa<ConstantPointerNull>(I.getOperand(1))) {
       accumulateSROACost(CostIt, InlineConstants::InstrCost);
@@ -770,6 +861,7 @@
       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
       if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
+        accumulateSavings();
         SimplifiedValues[&I] = C;
         ++NumConstantPtrDiffs;
         return true;
@@ -807,7 +899,7 @@
 
 bool CallAnalyzer::visitLoad(LoadInst &I) {
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
     if (I.isSimple()) {
       accumulateSROACost(CostIt, InlineConstants::InstrCost);
@@ -822,7 +914,7 @@
 
 bool CallAnalyzer::visitStore(StoreInst &I) {
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
     if (I.isSimple()) {
       accumulateSROACost(CostIt, InlineConstants::InstrCost);
@@ -874,15 +966,17 @@
   // inside of instsimplify, directly constant fold calls here.
   if (!canConstantFoldCallTo(F))
     return false;
-
+  bool SVLookup = false;
   // Try to re-map the arguments to constants.
   SmallVector<Constant *, 4> ConstantArgs;
   ConstantArgs.reserve(CS.arg_size());
   for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
        ++I) {
     Constant *C = dyn_cast<Constant>(*I);
-    if (!C)
+    if (!C) {
       C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
+      SVLookup = true;
+    }
     if (!C)
       return false; // This argument doesn't map to a constant.
 
@@ -890,6 +984,12 @@
   }
   if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
     SimplifiedValues[CS.getInstruction()] = C;
+
+    // FIXME: Increase the savings associated with simplifying a callsite.
+    if (SVLookup)
+      accumulateSavings(InlineConstants::InstrCost +
+                        InlineConstants::CallPenalty);
+
     return true;
   }
 
@@ -920,7 +1020,7 @@
 
       case Intrinsic::load_relative:
         // This is normally lowered to 4 LLVM instructions.
-        Cost += 3 * InlineConstants::InstrCost;
+        accumulateCost(3 * InlineConstants::InstrCost);
         return false;
 
       case Intrinsic::memset:
@@ -944,12 +1044,12 @@
     if (TTI.isLoweredToCall(F)) {
       // We account for the average 1 instruction per call argument setup
       // here.
-      Cost += CS.arg_size() * InlineConstants::InstrCost;
+      accumulateCost(CS.arg_size() * InlineConstants::InstrCost);
 
       // Everything other than inline ASM will also have a significant cost
       // merely from making the call.
       if (!isa<InlineAsm>(CS.getCalledValue()))
-        Cost += InlineConstants::CallPenalty;
+        accumulateCost(InlineConstants::CallPenalty);
     }
 
     return Base::visitCallSite(CS);
@@ -961,7 +1061,7 @@
 
   // First, pay the price of the argument setup. We account for the average
   // 1 instruction per call argument setup here.
-  Cost += CS.arg_size() * InlineConstants::InstrCost;
+  accumulateCost(CS.arg_size() * InlineConstants::InstrCost);
 
   // Next, check if this happens to be an indirect function call to a known
   // function in this inline context. If not, we've done all we can.
@@ -982,6 +1082,12 @@
     // We were able to inline the indirect call! Subtract the cost from the
     // threshold to get the bonus we want to apply, but don't go below zero.
     Cost -= std::max(0, CA.getThreshold() - CA.getCost());
+    // FIXME: This underestimates the savings due to removing a call. Perhaps we
+    // should get the weighted cost and savings of inlining the indirect call,
+    // scale it based on callsite and callee's entry frequencies and add them
+    // up to the current callee's weighted cost and savings.
+    accumulateSavings(InlineConstants::InstrCost +
+                      InlineConstants::CallPenalty);
   }
 
   return Base::visitCallSite(CS);
@@ -999,9 +1105,14 @@
   // shouldn't exist at all, but handling them makes the behavior of the
   // inliner more regular and predictable. Interestingly, conditional branches
   // which will fold away are also free.
-  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
-         dyn_cast_or_null<ConstantInt>(
-             SimplifiedValues.lookup(BI.getCondition()));
+  if (BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()))
+    return true;
+  if (dyn_cast_or_null<ConstantInt>(
+          SimplifiedValues.lookup(BI.getCondition()))) {
+    accumulateSavings();
+    return true;
+  }
+  return false;
 }
 
 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
@@ -1032,10 +1143,12 @@
   };
 
   if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
-    if (isa<ConstantInt>(V))
+    if (isa<ConstantInt>(V)) {
+      accumulateSavings(SwitchInstCost(SI));
       return true;
+    }
 
-  Cost += SwitchInstCost(SI);
+  accumulateCost(SwitchInstCost(SI));
   return false;
 }
 
@@ -1100,6 +1213,9 @@
 /// viable, and true if inlining remains viable.
 bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                 SmallPtrSetImpl<const Value *> &EphValues) {
+  if (CalleeBFI)
+    CurrBBFreq = CalleeBFI->getBlockFreq(BB).getFrequency();
+
   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
     // FIXME: Currently, the number of instructions in a function regardless of
     // our ability to simplify them during inline to constants or dead code,
@@ -1136,7 +1252,7 @@
 
       if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
           hasSoftFloatAttr)
-        Cost += InlineConstants::CallPenalty;
+        accumulateCost(InlineConstants::CallPenalty);
     }
 
     // If the instruction simplified to a constant, there is no cost to this
@@ -1147,7 +1263,7 @@
     if (Base::visit(&*I))
       ++NumInstructionsSimplified;
     else
-      Cost += InlineConstants::InstrCost;
+      accumulateCost(InlineConstants::InstrCost);
 
     // If the visit this instruction detected an uninlinable pattern, abort.
     if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
@@ -1232,6 +1348,17 @@
   assert(NumInstructions == 0);
   assert(NumVectorInstructions == 0);
 
+  Function *Caller = CS.getInstruction()->getParent()->getParent();
+  CallerBFI = CalleeBFI = nullptr;
+  if (GetBFI) {
+    CallerBFI = &((*GetBFI)(*Caller));
+    if (!F.isDeclaration()) {
+      // While evaluating the weighted savings due to removal of argument
+      // setup and the call overhead, use the callee entry block's frequency.
+      CalleeBFI = &((*GetBFI)(F));
+      CurrBBFreq = CalleeBFI->getEntryFreq();
+    }
+  }
   // Update the threshold based on callsite properties
   updateThreshold(CS, F);
 
@@ -1244,11 +1371,12 @@
   // threshold by 50% until we pass the single-BB phase.
   bool SingleBB = true;
   int SingleBBBonus = Threshold / 2;
+  int SpeedupBonus = getSpeedupBonus(CS, Threshold);
 
   // Speculatively apply all possible bonuses to Threshold. If cost exceeds
   // this Threshold any time, and cost cannot decrease, we can stop processing
   // the rest of the function body.
-  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
+  Threshold += (SingleBBBonus + FiftyPercentVectorBonus + SpeedupBonus);
 
   // Give out bonuses per argument, as the instructions setting them up will
   // be gone after inlining.
@@ -1270,16 +1398,19 @@
       // DataLayout.
       NumStores = std::min(NumStores, 8U);
 
-      Cost -= 2 * NumStores * InlineConstants::InstrCost;
+      accumulateCost(-2 * NumStores * InlineConstants::InstrCost);
+      accumulateSavings(2 * NumStores * InlineConstants::InstrCost);
     } else {
       // For non-byval arguments subtract off one instruction per call
       // argument.
-      Cost -= InlineConstants::InstrCost;
+      accumulateCost(-InlineConstants::InstrCost);
+      accumulateSavings(InlineConstants::InstrCost);
     }
   }
   // The call instruction also disappears after inlining.
-  Cost -= InlineConstants::InstrCost + InlineConstants::CallPenalty;
-  
+  accumulateCost(-InlineConstants::InstrCost - InlineConstants::CallPenalty);
+  accumulateSavings(InlineConstants::InstrCost + InlineConstants::CallPenalty);
+
   // If there is only one call of the function, and it has internal linkage,
   // the cost of inlining it drops dramatically.
   bool OnlyOneCallAndLocalLinkage =
@@ -1290,7 +1421,7 @@
   // If this function uses the coldcc calling convention, prefer not to inline
   // it.
   if (F.getCallingConv() == CallingConv::Cold)
-    Cost += InlineConstants::ColdccPenalty;
+    accumulateCost(InlineConstants::ColdccPenalty);
 
   // Check if we're done. This can happen due to bonuses and penalties.
   if (Cost > Threshold)
@@ -1299,7 +1430,6 @@
   if (F.empty())
     return true;
 
-  Function *Caller = CS.getInstruction()->getParent()->getParent();
   // Check if the caller function is recursive itself.
   for (User *U : Caller->users()) {
     CallSite Site(U);
@@ -1328,7 +1458,7 @@
       // We can SROA any pointer arguments derived from alloca instructions.
       if (isa<AllocaInst>(PtrArg)) {
         SROAArgValues[&*FAI] = PtrArg;
-        SROAArgCosts[PtrArg] = 0;
+        SROAArgCosts[PtrArg] = {0, 0};
       }
     }
   }
@@ -1432,6 +1562,9 @@
   else if (NumVectorInstructions <= NumInstructions / 2)
     Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);
 
+  if (!hasEstimatedSpeedup())
+    Threshold -= SpeedupBonus;
+
   return Cost < std::max(1, Threshold);
 }
 
@@ -1450,6 +1583,8 @@
   DEBUG_PRINT_STAT(SROACostSavingsLost);
   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
   DEBUG_PRINT_STAT(Cost);
+  DEBUG_PRINT_STAT(WeightedCost);
+  DEBUG_PRINT_STAT(WeightedSavings);
   DEBUG_PRINT_STAT(Threshold);
 #undef DEBUG_PRINT_STAT
 }
Index: test/Transforms/Inline/speedup-analysis.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/speedup-analysis.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -inline-threshold=10 -speedup-bonus-percent=1000 -S | FileCheck %s
+; Test that a callee that does not fit within the threshold gets inlined
+; because of the estimated speedup heuristic.
+define i32 @caller(i32 %n) {
+; CHECK-LABEL: define i32 @caller
+entry:
+  br label %loop
+loop:
+  %r = phi i32 [%n, %entry], [%result, %loop]
+; CHECK-NOT: call i32 @callee
+  %result = call i32 @callee(i32 %r)
+  %cond = icmp sle i32 %result, 100
+  br i1 %cond, label %loop, label %exit
+exit:
+; CHECK: ret
+  ret i32 %result
+}
+
+define i32 @callee(i32 %n) {
+  %cond = icmp sle i32 %n, 100
+  br i1 %cond, label %cond_true, label %cond_false, !prof !0
+
+cond_true:
+  %n1 = add i32 %n, 1
+  %n2 = add i32 %n1, 1
+  ret i32 %n2
+cond_false:
+  call void @extern()
+  call void @extern()
+  call void @extern()
+  ret i32 0
+}
+declare void @extern()
+
+!0 = !{!"branch_weights", i32 1, i32 0}
Index: test/Transforms/Inline/speedup-analysis2.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/speedup-analysis2.ll
@@ -0,0 +1,69 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -inline-threshold=10 -speedup-bonus-percent=1000 -S | FileCheck %s
+; Test that a callee that does not fit within the threshold gets inlined
+; because of the estimated speedup heuristic. The callee has a switch statement.
+; Since only one of the cases is executed per invocation of the callee, the
+; weighted cost of the callee is low and results in a big relative speedup as the
+; benefits of removing the function call is accounted for.
+
+define i32 @caller(i32 %n) {
+; CHECK-LABEL: define i32 @caller
+entry:
+  br label %loop
+loop:
+  %r = phi i32 [%n, %entry], [%result, %loop]
+; CHECK-NOT: call i32 @callee
+  %result = call i32 @callee(i32 %r)
+  %cond = icmp sle i32 %result, 100
+  br i1 %cond, label %loop, label %exit
+exit:
+; CHECK: ret
+  ret i32 %result
+}
+
+define i32 @callee(i32 %n) {
+entry:
+  switch i32 %n, label %return [
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+    i32 4, label %sw.bb4
+    i32 5, label %sw.bb5
+    i32 6, label %sw.bb6
+    i32 7, label %sw.bb7
+  ]
+
+sw.bb1:
+  %r1 = add i32 %n, 1
+  br label %return
+
+sw.bb2:
+  %r2 = add i32 %n, 2
+  br label %return
+
+sw.bb3:
+  %r3 = add i32 %n, 3
+  br label %return
+
+sw.bb4:
+  %r4 = add i32 %n, 4
+  br label %return
+
+sw.bb5:
+  %r5 = add i32 %n, 5
+  br label %return
+
+sw.bb6:
+  %r6 = add i32 %n, 6
+  br label %return
+
+sw.bb7:
+  %r7 = add i32 %n, 7
+  br label %return
+
+return:
+  %res = phi i32 [%n, %entry], [%r1, %sw.bb1], [%r2, %sw.bb2], [%r3, %sw.bb3], [%r4, %sw.bb4], [%r5, %sw.bb5], [%r6, %sw.bb6], [%r7, %sw.bb7]
+  ret i32 %res
+
+}
+declare void @extern()
+