Index: lib/Analysis/InlineCost.cpp
===================================================================
--- lib/Analysis/InlineCost.cpp
+++ lib/Analysis/InlineCost.cpp
@@ -66,6 +66,41 @@
                                         cl::ZeroOrMore,
                                         cl::desc("Threshold for hot callsites "));
 
+// The following options control the heuristic that estimates speedup due to
+// inlining and adds a threshold bonus when the speedup is above a certain
+// limit. Speedup here refers to savings in weighted cost due to inlining
+// relative to the weighted cost of the uninlined callee.
+//
+/// Threshold bonus to apply when inlining is expected to result in a speedup.
+/// The bonus is expressed as a percent value: the unbonused threshold is
+/// multiplied by this value and divided by 100 to arrive at the threshold
+/// increase.
+static cl::opt<int>
+    SpeedupBonusPercent("speedup-bonus-percent", cl::Hidden, cl::init(200),
+                        cl::ZeroOrMore,
+                        cl::desc("Bonus for callees showing speedup"));
+
+/// Minimum speedup percentage required to apply a bonus to the threshold.
+///
+/// Speedup is computed as the percentage of weighted cost savings if the
+/// callee is inlined.
+static cl::opt<int>
+    MinSpeedupForBonus("min-speedup-for-bonus", cl::Hidden, cl::init(10),
+                       cl::ZeroOrMore,
+                       cl::desc("Minimum weighted cost savings (in percentage) "
+                                "needed to apply the speedup bonus"));
+
+/// Minimum relative frequency of a callsite to apply the speedup-based bonus.
+///
+/// The speedup bonus is applied if the callsite is hot and not applied if it
+/// is cold, as determined by the profile summary. If the callsite is not known
+/// to be hot or cold, the bonus is applied only if the block frequency of the
+/// callsite relative to the caller's entry block meets this ratio, to limit
+/// the size increase due to the speedup bonus.
+static cl::opt<unsigned> MinBFRatioForSpeedupBonus(
+    "min-bf-ratio-for-speedup-bonus", cl::Hidden, cl::init(8), cl::ZeroOrMore,
+    cl::desc("Minimum ratio of callsite's block frequency to caller's entry "
+             "block frequency to apply the speedup bonus"));
+
 namespace {
 
 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
@@ -95,6 +130,7 @@
   /// Tunable parameters that control the analysis.
   const InlineParams &Params;
 
+  BlockFrequencyInfo *CallerBFI, *CalleeBFI;
   int Threshold;
   int Cost;
 
@@ -126,22 +162,28 @@
   /// allocas on the caller stack which could be simplified through SROA.
   DenseMap<Value *, Value *> SROAArgValues;
 
+  // We track unweighted and weighted (by block frequency) SROA cost savings.
+  using SROACostTy = std::pair<int, uint64_t>;
+
   /// The mapping of caller Alloca values to their accumulated cost savings. If
   /// we have to disable SROA for one of the allocas, this tells us how much
   /// cost must be added.
-  DenseMap<Value *, int> SROAArgCosts;
+  DenseMap<Value *, SROACostTy> SROAArgCosts;
 
   /// Keep track of values which map to a pointer base and constant offset.
   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
 
   // Custom simplification helper routines.
   bool isAllocaDerivedArg(Value *V);
+  int getArgPassingCost(CallSite CS, Function *Callee);
   bool lookupSROAArgAndCost(Value *V, Value *&Arg,
-                            DenseMap<Value *, int>::iterator &CostIt);
-  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
+                            DenseMap<Value *, SROACostTy>::iterator &CostIt);
+  void disableSROA(DenseMap<Value *, SROACostTy>::iterator CostIt);
   void disableSROA(Value *V);
-  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
+  void accumulateSROACost(DenseMap<Value *, SROACostTy>::iterator CostIt,
                           int InstructionCost);
+  void accumulateCost(int InstructionCost);
+  void accumulateSavings(int Savings);
   bool isGEPFree(GetElementPtrInst &GEP);
   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
   bool simplifyCallSite(Function *F, CallSite CS);
@@ -209,6 +251,14 @@
   bool visitCleanupReturnInst(CleanupReturnInst &RI);
   bool visitCatchReturnInst(CatchReturnInst &RI);
   bool visitUnreachableInst(UnreachableInst &I);
+  /// Simplify \p I if its operands are constants and update SimplifiedValues.
+  /// \p Evaluate is a lambda specific to the instruction type that evaluates
+  /// the instruction when all the operands are constants.
+  bool
+  simplifyInstruction(Instruction &I,
+                      function_ref<Constant *(SmallVectorImpl<Constant *> &)>
+                          Evaluate);
+  int getSpeedupBonus(CallSite &CS, int Threshold);
+  bool hasLargeSpeedup();
 
 public:
   CallAnalyzer(const TargetTransformInfo &TTI,
@@ -226,8 +276,8 @@
         FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
         NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
         NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
-        NumInstructionsSimplified(0), SROACostSavings(0),
-        SROACostSavingsLost(0) {}
+        NumInstructionsSimplified(0), SROACostSavings(0), WeightedSavings(0),
+        WeightedCost(0), SROACostSavingsLost(0), CurrBBFreq(1) {}
 
   bool analyzeCall(CallSite CS);
 
@@ -243,7 +293,10 @@
   unsigned NumConstantPtrDiffs;
   unsigned NumInstructionsSimplified;
   unsigned SROACostSavings;
+  int WeightedSavings, WeightedCost;
   unsigned SROACostSavingsLost;
+  /// The block frequency of the current block being analyzed.
+  uint64_t CurrBBFreq;
 
   void dump();
 };
@@ -255,10 +308,47 @@
   return SROAArgValues.count(V);
 }
 
+/// Return the cost of passing arguments to the callee at callsite \p CS.
+///
+/// This cost is negative, since inlining would eliminate the instructions
+/// needed for setting up the arguments.
+int CallAnalyzer::getArgPassingCost(CallSite CS, Function *Callee) {
+  // Give out bonuses per argument, as the instructions setting them up will
+  // be gone after inlining.
+  int Cost = 0;
+  const DataLayout &DL = Callee->getParent()->getDataLayout();
+  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
+    if (CS.isByValArgument(I)) {
+      // We approximate the number of loads and stores needed by dividing the
+      // size of the byval type by the target's pointer size.
+      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
+      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
+      unsigned PointerSize = DL.getPointerSizeInBits();
+      // Ceiling division.
+      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
+
+      // If it generates more than 8 stores it is likely to be expanded as an
+      // inline memcpy so we take that as an upper bound. Otherwise we assume
+      // one load and one store per word copied.
+      // FIXME: The maxStoresPerMemcpy setting from the target should be used
+      // here instead of a magic number of 8, but it's not available via
+      // DataLayout.
+      NumStores = std::min(NumStores, 8U);
+
+      Cost -= 2 * NumStores * InlineConstants::InstrCost;
+    } else {
+      // For non-byval arguments subtract off one instruction per call
+      // argument.
+      Cost -= InlineConstants::InstrCost;
+    }
+  }
+  return Cost;
+}
+
 /// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
 /// Returns false if V does not map to a SROA-candidate.
 bool CallAnalyzer::lookupSROAArgAndCost(
-    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
+    Value *V, Value *&Arg, DenseMap<Value *, SROACostTy>::iterator &CostIt) {
   if (SROAArgValues.empty() || SROAArgCosts.empty())
     return false;
 
@@ -275,28 +365,37 @@
 ///
 /// This marks the candidate as no longer viable for SROA, and adds the cost
 /// savings associated with it back into the inline cost measurement.
-void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
+void CallAnalyzer::disableSROA(DenseMap<Value *, SROACostTy>::iterator CostIt) {
   // If we're no longer able to perform SROA we need to undo its cost savings
   // and prevent subsequent analysis.
-  Cost += CostIt->second;
-  SROACostSavings -= CostIt->second;
-  SROACostSavingsLost += CostIt->second;
+  Cost += CostIt->second.first;
+  WeightedCost += CostIt->second.second;
+  SROACostSavings -= CostIt->second.first;
+  WeightedSavings -= CostIt->second.second;
+  SROACostSavingsLost += CostIt->second.first;
   SROAArgCosts.erase(CostIt);
 }
 
 /// \brief If 'V' maps to a SROA candidate, disable SROA for it.
 void CallAnalyzer::disableSROA(Value *V) {
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   if (lookupSROAArgAndCost(V, SROAArg, CostIt))
     disableSROA(CostIt);
 }
 
 /// \brief Accumulate the given cost for a particular SROA candidate.
-void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
-                                      int InstructionCost) {
-  CostIt->second += InstructionCost;
+void CallAnalyzer::accumulateSROACost(
+    DenseMap<Value *, SROACostTy>::iterator CostIt, int InstructionCost) {
+  CostIt->second.first += InstructionCost;
   SROACostSavings += InstructionCost;
+  // FIXME: We use saturating multiply on uint64_t here and below where we
+  // compute weighted cost/savings. If this proves to be less precise, consider
+  // using a 128-bit APInt and also using relative block frequency (scaling
+  // block frequencies relative to the entry block).
+  auto WeightedCost =
+      SaturatingMultiply(CurrBBFreq, (uint64_t)InstructionCost);
+  CostIt->second.second += WeightedCost;
+  WeightedSavings += WeightedCost;
 }
 
 /// \brief Accumulate a constant GEP offset into an APInt if possible.
@@ -394,9 +493,14 @@
   return true;
 }
 
+void CallAnalyzer::accumulateCost(int InstructionCost) {
+  Cost += InstructionCost;
+  WeightedCost += SaturatingMultiply(CurrBBFreq, (uint64_t)InstructionCost);
+}
+
 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   bool SROACandidate =
       lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
 
@@ -413,6 +517,10 @@
       // Non-constant GEPs aren't folded, and disable SROA.
       if (SROACandidate)
         disableSROA(CostIt);
+      // isGEPFree looks up SimplifiedValues, so ideally we should be tracking
+      // savings here as well, but that would require calling the TTI hook
+      // with and without simplified values. So we take the conservative route
+      // and assume no savings due to the lookup.
       return isGEPFree(I);
     }
 
@@ -428,11 +536,21 @@
     }
   }
 
+  // Lambda to check whether a GEP's indices are all constant.
   auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
-    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
-      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
-        return false;
+    bool SVLookup = false;
+    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E;
+         ++I) {
+      if (!isa<Constant>(*I)) {
+        if (SimplifiedValues.lookup(*I))
+          SVLookup = true;
+        else
+          return false;
+      }
+    }
+    if (SVLookup)
+      accumulateSavings(InlineConstants::InstrCost);
     return true;
   };
 
@@ -450,16 +568,23 @@
   return isGEPFree(I);
 }
 
+void CallAnalyzer::accumulateSavings(int Savings) {
+  WeightedSavings += SaturatingMultiply(CurrBBFreq, (uint64_t)Savings);
+}
+
 /// Simplify \p I if its operands are constants and update SimplifiedValues.
 /// \p Evaluate is a callable specific to instruction type that evaluates the
 /// instruction when all the operands are constants.
 template <typename Callable>
 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
   SmallVector<Constant *, 2> COps;
+  bool SVLookup = false;
   for (Value *Op : I.operands()) {
     Constant *COp = dyn_cast<Constant>(Op);
-    if (!COp)
+    if (!COp) {
       COp = SimplifiedValues.lookup(Op);
+      SVLookup = true;
+    }
     if (!COp)
       return false;
     COps.push_back(COp);
@@ -468,9 +593,47 @@
   if (!C)
     return false;
   SimplifiedValues[&I] = C;
+  // If SVLookup is true, then the constness of at least one of the operands
+  // is inferred only after looking up the SimplifiedValues map. In other
+  // words, the constness of the operands (and consequently of the result)
+  // cannot be inferred without inlining. Thus we attribute the savings
+  // associated with the elimination of this instruction to inlining.
+  if (SVLookup)
+    accumulateSavings(InlineConstants::InstrCost);
   return true;
 }
 
+/// Get the bonus to be applied for callsites with significant speedup.
+int CallAnalyzer::getSpeedupBonus(CallSite &CS, int Threshold) {
+  if (!CallerBFI || !CalleeBFI)
+    return 0;
+  auto EntryFreq = CallerBFI->getEntryFreq();
+  auto *BB = CS.getInstruction()->getParent();
+  auto CallSiteFreq = CallerBFI->getBlockFreq(BB).getFrequency();
+  int Bonus = SpeedupBonusPercent * Threshold / 100;
+  // Always apply the bonus for hot callsites and never apply it for cold
+  // callsites.
+  if (PSI) {
+    if (PSI->isHotCallSite(CS, CallerBFI))
+      return Bonus;
+    if (PSI->isColdCallSite(CS, CallerBFI))
+      return 0;
+  }
+  // In the absence of a profile summary, or when the callsite is neither hot
+  // nor cold, apply the bonus only if the callsite's frequency relative to
+  // the caller's entry frequency is at least MinBFRatioForSpeedupBonus.
+  if (CallSiteFreq / EntryFreq >= MinBFRatioForSpeedupBonus)
+    return Bonus;
+  return 0;
+}
+
+/// Return true if the estimated speedup is large enough to apply the bonus.
+bool CallAnalyzer::hasLargeSpeedup() {
+  int D = std::max(1, WeightedCost + WeightedSavings);
+  int Speedup = WeightedSavings * 100 / D;
+  return Speedup > MinSpeedupForBonus;
+}
+
 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
   // Propagate constants through bitcasts.
   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
@@ -487,7 +650,7 @@
 
   // Also look for SROA candidates here.
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
     SROAArgValues[&I] = SROAArg;
 
@@ -521,7 +684,7 @@
   // preserved either cannot fire on an integer, or won't in-and-of themselves
   // disable SROA (ext) w/o some later use that we would see and disable.
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
     SROAArgValues[&I] = SROAArg;
 
@@ -548,7 +711,7 @@
 
   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
     SROAArgValues[&I] = SROAArg;
 
@@ -667,7 +830,6 @@
   if (Callee.hasFnAttribute(Attribute::InlineHint))
     Threshold = MaxIfValid(Threshold, Params.HintThreshold);
   if (PSI) {
-    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    if (PSI->isHotCallSite(CS, CallerBFI)) {
       DEBUG(dbgs() << "Hot callsite.\n");
       Threshold = Params.HotCallSiteThreshold.getValue();
@@ -716,6 +878,7 @@
         Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
         Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
         if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
+          accumulateSavings(InlineConstants::InstrCost);
           SimplifiedValues[&I] = C;
           ++NumConstantPtrCmps;
           return true;
@@ -727,6 +890,8 @@
   // if we know the value (argument) can't be null
   if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
       isKnownNonNullInCallee(I.getOperand(0))) {
+    if (isAllocaDerivedArg(I.getOperand(0)))
+      accumulateSavings(InlineConstants::InstrCost);
     bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
     SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                       : ConstantInt::getFalse(I.getType());
@@ -734,7 +899,7 @@
   }
   // Finally check for SROA candidates in comparisons.
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
     if (isa<ConstantPointerNull>(I.getOperand(1))) {
       accumulateSROACost(CostIt, InlineConstants::InstrCost);
@@ -762,6 +927,7 @@
     Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
     Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
     if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
+      accumulateSavings(InlineConstants::InstrCost);
      SimplifiedValues[&I] = C;
       ++NumConstantPtrDiffs;
       return true;
@@ -799,7 +965,7 @@
 
 bool CallAnalyzer::visitLoad(LoadInst &I) {
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
     if (I.isSimple()) {
       accumulateSROACost(CostIt, InlineConstants::InstrCost);
@@ -814,7 +980,7 @@
 
 bool CallAnalyzer::visitStore(StoreInst &I) {
   Value *SROAArg;
-  DenseMap<Value *, int>::iterator CostIt;
+  DenseMap<Value *, SROACostTy>::iterator CostIt;
   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
     if (I.isSimple()) {
       accumulateSROACost(CostIt, InlineConstants::InstrCost);
@@ -864,15 +1030,17 @@
   // inside of instsimplify, directly constant fold calls here.
   if (!canConstantFoldCallTo(F))
     return false;
-
+  bool SVLookup = false;
   // Try to re-map the arguments to constants.
   SmallVector<Constant *, 4> ConstantArgs;
   ConstantArgs.reserve(CS.arg_size());
   for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
        ++I) {
     Constant *C = dyn_cast<Constant>(*I);
-    if (!C)
+    if (!C) {
       C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
+      SVLookup = true;
+    }
     if (!C)
       return false; // This argument doesn't map to a constant.
 
@@ -880,6 +1048,19 @@
   }
   if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
     SimplifiedValues[CS.getInstruction()] = C;
+
+    // If SVLookup is true, then the constness of at least one argument is
+    // inferred only after looking up the SimplifiedValues map.
+    // In other words, it is not possible to infer the constness of all the
+    // arguments (and hence the result of the call due to constant folding)
+    // without inlining. Thus we attribute the savings associated with the
+    // elimination of this callsite to inlining.
+    //
+    // FIXME: Increase the savings associated with simplifying a callsite.
+    if (SVLookup)
+      accumulateSavings(InlineConstants::InstrCost +
+                        InlineConstants::CallPenalty);
+
     return true;
   }
 
@@ -910,7 +1091,7 @@
 
   case Intrinsic::load_relative:
     // This is normally lowered to 4 LLVM instructions.
-    Cost += 3 * InlineConstants::InstrCost;
+    accumulateCost(3 * InlineConstants::InstrCost);
     return false;
 
   case Intrinsic::memset:
@@ -934,12 +1115,12 @@
     if (TTI.isLoweredToCall(F)) {
       // We account for the average 1 instruction per call argument setup
      // here.
-      Cost += CS.arg_size() * InlineConstants::InstrCost;
+      accumulateCost(CS.arg_size() * InlineConstants::InstrCost);
 
       // Everything other than inline ASM will also have a significant cost
       // merely from making the call.
       if (!isa<InlineAsm>(CS.getCalledValue()))
-        Cost += InlineConstants::CallPenalty;
+        accumulateCost(InlineConstants::CallPenalty);
     }
 
     return Base::visitCallSite(CS);
@@ -951,7 +1132,7 @@
 
   // First, pay the price of the argument setup. We account for the average
   // 1 instruction per call argument setup here.
-  Cost += CS.arg_size() * InlineConstants::InstrCost;
+  accumulateCost(CS.arg_size() * InlineConstants::InstrCost);
 
   // Next, check if this happens to be an indirect function call to a known
   // function in this inline context. If not, we've done all we can.
@@ -972,6 +1153,13 @@
     // We were able to inline the indirect call! Subtract the cost from the
     // threshold to get the bonus we want to apply, but don't go below zero.
     Cost -= std::max(0, CA.getThreshold() - CA.getCost());
+    // FIXME: This underestimates the savings due to removing a call. Perhaps
+    // we should get the weighted cost and savings of inlining the indirect
+    // call, scale them based on the callsite's and the callee's entry
+    // frequencies, and add them to the current callee's weighted cost and
+    // savings.
+    int ArgPassingCost = getArgPassingCost(CS, F);
+    accumulateSavings(InlineConstants::InstrCost +
+                      InlineConstants::CallPenalty - ArgPassingCost);
   }
 
   return Base::visitCallSite(CS);
@@ -989,9 +1177,14 @@
   // shouldn't exist at all, but handling them makes the behavior of the
   // inliner more regular and predictable. Interestingly, conditional branches
   // which will fold away are also free.
-  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
-         dyn_cast_or_null<ConstantInt>(
-             SimplifiedValues.lookup(BI.getCondition()));
+  if (BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()))
+    return true;
+  if (dyn_cast_or_null<ConstantInt>(
+          SimplifiedValues.lookup(BI.getCondition()))) {
+    accumulateSavings(InlineConstants::InstrCost);
+    return true;
+  }
+  return false;
 }
 
 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
@@ -999,26 +1192,35 @@
   // branches.
   if (isa<ConstantInt>(SI.getCondition()))
     return true;
+
+  // Lambda to compute the cost of a switch instruction.
+  auto SwitchInstCost = [](SwitchInst &SI) {
+    // We need to compute a cost proportional to the number of distinct
+    // successor blocks. This fan-out in the CFG cannot be represented for
+    // free even if we can represent the core switch as a jumptable that
+    // takes a single instruction.
+    //
+    // NB: We convert large switches which are just used to initialize large
+    // phi nodes to lookup tables instead in simplify-cfg, so this shouldn't
+    // prevent inlining those.
+    // It will prevent inlining in cases where the optimization does not
+    // (yet) fire.
+    SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
+    SuccessorBlocks.insert(SI.getDefaultDest());
+    for (auto Case : SI.cases())
+      SuccessorBlocks.insert(Case.getCaseSuccessor());
+
+    // Add a cost corresponding to the number of distinct destinations. The
+    // first we model as free because of fallthrough.
+    return (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
+  };
+
   if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
-    if (isa<ConstantInt>(V))
+    if (isa<ConstantInt>(V)) {
+      accumulateSavings(SwitchInstCost(SI));
       return true;
+    }
 
-  // Otherwise, we need to accumulate a cost proportional to the number of
-  // distinct successor blocks. This fan-out in the CFG cannot be represented
-  // for free even if we can represent the core switch as a jumptable that
-  // takes a single instruction.
-  //
-  // NB: We convert large switches which are just used to initialize large phi
-  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
-  // inlining those. It will prevent inlining in cases where the optimization
-  // does not (yet) fire.
-  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
-  SuccessorBlocks.insert(SI.getDefaultDest());
-  for (auto Case : SI.cases())
-    SuccessorBlocks.insert(Case.getCaseSuccessor());
-  // Add cost corresponding to the number of distinct destinations. The first
-  // we model as free because of fallthrough.
-  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
+  accumulateCost(SwitchInstCost(SI));
   return false;
 }
 
@@ -1083,6 +1285,9 @@
 /// viable, and true if inlining remains viable.
 bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                 SmallPtrSetImpl<const Value *> &EphValues) {
+  if (CalleeBFI)
+    CurrBBFreq = CalleeBFI->getBlockFreq(BB).getFrequency();
+
   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
     // FIXME: Currently, the number of instructions in a function regardless of
     // our ability to simplify them during inline to constants or dead code,
@@ -1119,7 +1324,7 @@
 
       if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
          hasSoftFloatAttr)
-        Cost += InlineConstants::CallPenalty;
+        accumulateCost(InlineConstants::CallPenalty);
     }
 
     // If the instruction simplified to a constant, there is no cost to this
@@ -1130,7 +1335,7 @@
     if (Base::visit(&*I))
       ++NumInstructionsSimplified;
     else
-      Cost += InlineConstants::InstrCost;
+      accumulateCost(InlineConstants::InstrCost);
 
     // If visiting this instruction detected an uninlinable pattern, abort.
     if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
@@ -1215,54 +1420,46 @@
   assert(NumInstructions == 0);
   assert(NumVectorInstructions == 0);
 
+  Function *Caller = CS.getInstruction()->getParent()->getParent();
+  CallerBFI = nullptr;
+  CalleeBFI = nullptr;
+  if (GetBFI) {
+    CallerBFI = &((*GetBFI)(*Caller));
+    if (!F.isDeclaration()) {
+      CalleeBFI = &((*GetBFI)(F));
+      // While evaluating the weighted savings due to the removal of argument
+      // setup and the call overhead, we want to use the entry block's
+      // frequency.
+      CurrBBFreq = CalleeBFI->getEntryFreq();
+    }
+  }
   // Update the threshold based on callsite properties
   updateThreshold(CS, F);
 
   FiftyPercentVectorBonus = 3 * Threshold / 2;
   TenPercentVectorBonus = 3 * Threshold / 4;
-  const DataLayout &DL = F.getParent()->getDataLayout();
 
   // Track whether the post-inlining function would have more than one basic
   // block. A single basic block is often intended for inlining. Balloon the
   // threshold by 50% until we pass the single-BB phase.
   bool SingleBB = true;
   int SingleBBBonus = Threshold / 2;
+  int SpeedupBonus = getSpeedupBonus(CS, Threshold);
 
   // Speculatively apply all possible bonuses to Threshold. If cost exceeds
   // this Threshold any time, and cost cannot decrease, we can stop processing
   // the rest of the function body.
-  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
+  Threshold += (SingleBBBonus + FiftyPercentVectorBonus + SpeedupBonus);
 
-  // Give out bonuses per argument, as the instructions setting them up will
-  // be gone after inlining.
-  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
-    if (CS.isByValArgument(I)) {
-      // We approximate the number of loads and stores needed by dividing the
-      // size of the byval type by the target's pointer size.
-      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
-      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
-      unsigned PointerSize = DL.getPointerSizeInBits();
-      // Ceiling division.
-      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
+  int ArgPassingCost = getArgPassingCost(CS, &F);
+  accumulateCost(ArgPassingCost);
+  // The argument passing cost is negative. We negate it to get the savings
+  // due to inlining.
+  accumulateSavings(-ArgPassingCost);
 
-      // If it generates more than 8 stores it is likely to be expanded as an
-      // inline memcpy so we take that as an upper bound. Otherwise we assume
-      // one load and one store per word copied.
-      // FIXME: The maxStoresPerMemcpy setting from the target should be used
-      // here instead of a magic number of 8, but it's not available via
-      // DataLayout.
-      NumStores = std::min(NumStores, 8U);
-
-      Cost -= 2 * NumStores * InlineConstants::InstrCost;
-    } else {
-      // For non-byval arguments subtract off one instruction per call
-      // argument.
-      Cost -= InlineConstants::InstrCost;
-    }
-  }
   // The call instruction also disappears after inlining.
-  Cost -= InlineConstants::InstrCost + InlineConstants::CallPenalty;
-
+  accumulateCost(-InlineConstants::InstrCost - InlineConstants::CallPenalty);
+  accumulateSavings(InlineConstants::InstrCost + InlineConstants::CallPenalty);
+
   // If there is only one call of the function, and it has internal linkage,
   // the cost of inlining it drops dramatically.
   bool OnlyOneCallAndLocalLinkage =
@@ -1273,7 +1470,7 @@
   // If this function uses the coldcc calling convention, prefer not to inline
   // it.
   if (F.getCallingConv() == CallingConv::Cold)
-    Cost += InlineConstants::ColdccPenalty;
+    accumulateCost(InlineConstants::ColdccPenalty);
 
   // Check if we're done. This can happen due to bonuses and penalties.
   if (Cost > Threshold)
@@ -1282,7 +1479,6 @@
   if (F.empty())
     return true;
 
-  Function *Caller = CS.getInstruction()->getParent()->getParent();
   // Check if the caller function is recursive itself.
   for (User *U : Caller->users()) {
     CallSite Site(U);
@@ -1311,7 +1507,7 @@
       // We can SROA any pointer arguments derived from alloca instructions.
       if (isa<AllocaInst>(PtrArg)) {
         SROAArgValues[&*FAI] = PtrArg;
-        SROAArgCosts[PtrArg] = 0;
+        SROAArgCosts[PtrArg] = {0, 0};
       }
     }
   }
@@ -1415,6 +1611,9 @@
   else if (NumVectorInstructions <= NumInstructions / 2)
     Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);
 
+  if (!hasLargeSpeedup())
+    Threshold -= SpeedupBonus;
+
   return Cost < std::max(1, Threshold);
 }
 
@@ -1433,6 +1632,8 @@
   DEBUG_PRINT_STAT(SROACostSavingsLost);
   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
   DEBUG_PRINT_STAT(Cost);
+  DEBUG_PRINT_STAT(WeightedCost);
+  DEBUG_PRINT_STAT(WeightedSavings);
   DEBUG_PRINT_STAT(Threshold);
 #undef DEBUG_PRINT_STAT
 }
Index: test/Transforms/Inline/speedup-analysis.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/speedup-analysis.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -inline-threshold=10 -speedup-bonus-percent=1000 -S | FileCheck %s
+; Test that a callee that does not fit within the threshold gets inlined
+; because of the estimated speedup heuristic.
+define i32 @caller(i32 %n) {
+; CHECK-LABEL: define i32 @caller
+entry:
+  br label %loop
+loop:
+  %r = phi i32 [%n, %entry], [%result, %loop]
+; CHECK-NOT: call i32 @callee
+  %result = call i32 @callee(i32 %r)
+  %cond = icmp sle i32 %result, 100
+  br i1 %cond, label %loop, label %exit
+exit:
+; CHECK: ret
+  ret i32 %result
+}
+
+define i32 @callee(i32 %n) {
+  %cond = icmp sle i32 %n, 100
+  br i1 %cond, label %cond_true, label %cond_false, !prof !0
+
+cond_true:
+  %n1 = add i32 %n, 1
+  %n2 = add i32 %n1, 1
+  ret i32 %n2
+cond_false:
+  call void @extern()
+  call void @extern()
+  call void @extern()
+  ret i32 0
+}
+declare void @extern()
+
+!0 = !{!"branch_weights", i32 1, i32 0}
Index: test/Transforms/Inline/speedup-analysis2.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/speedup-analysis2.ll
@@ -0,0 +1,69 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -inline-threshold=10 -speedup-bonus-percent=1000 -S | FileCheck %s
+; Test that a callee that does not fit within the threshold gets inlined
+; because of the estimated speedup heuristic. The callee has a switch statement.
+; Since only one of the cases is executed per invocation of the callee, the
+; weighted cost of the callee is low, resulting in a big relative speedup once
+; the benefit of removing the function call is accounted for.
+
+define i32 @caller(i32 %n) {
+; CHECK-LABEL: define i32 @caller
+entry:
+  br label %loop
+loop:
+  %r = phi i32 [%n, %entry], [%result, %loop]
+; CHECK-NOT: call i32 @callee
+  %result = call i32 @callee(i32 %r)
+  %cond = icmp sle i32 %result, 100
+  br i1 %cond, label %loop, label %exit
+exit:
+; CHECK: ret
+  ret i32 %result
+}
+
+define i32 @callee(i32 %n) {
+entry:
+  switch i32 %n, label %return [
+    i32 1, label %sw.bb1
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+    i32 4, label %sw.bb4
+    i32 5, label %sw.bb5
+    i32 6, label %sw.bb6
+    i32 7, label %sw.bb7
+  ]
+
+sw.bb1:
+  %r1 = add i32 %n, 1
+  br label %return
+
+sw.bb2:
+  %r2 = add i32 %n, 2
+  br label %return
+
+sw.bb3:
+  %r3 = add i32 %n, 3
+  br label %return
+
+sw.bb4:
+  %r4 = add i32 %n, 4
+  br label %return
+
+sw.bb5:
+  %r5 = add i32 %n, 5
+  br label %return
+
+sw.bb6:
+  %r6 = add i32 %n, 6
+  br label %return
+
+sw.bb7:
+  %r7 = add i32 %n, 7
+  br label %return
+
+return:
+  %res = phi i32 [%n, %entry], [%r1, %sw.bb1], [%r2, %sw.bb2], [%r3, %sw.bb3], [%r4, %sw.bb4], [%r5, %sw.bb5], [%r6, %sw.bb6], [%r7, %sw.bb7]
+  ret i32 %res
+}
+declare void @extern()
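For reference, the following is a minimal standalone sketch of the arithmetic performed by getSpeedupBonus() and hasLargeSpeedup(), useful for sanity-checking the new knobs outside of LLVM. The block frequencies and instruction counts below are hypothetical; the constants mirror the option defaults introduced above and llvm::InlineConstants (InstrCost = 5, CallPenalty = 25), and saturatingMultiply stands in for llvm::SaturatingMultiply from Support/MathExtras.h. It is a sketch of the heuristic, not the patch's code verbatim.

// Standalone sketch (hypothetical numbers) of the speedup-bonus arithmetic.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>

// Mirrors llvm::SaturatingMultiply: clamp to UINT64_MAX on overflow.
static uint64_t saturatingMultiply(uint64_t X, uint64_t Y) {
  if (Y != 0 && X > std::numeric_limits<uint64_t>::max() / Y)
    return std::numeric_limits<uint64_t>::max();
  return X * Y;
}

int main() {
  const int64_t InstrCost = 5, CallPenalty = 25; // llvm::InlineConstants
  const int64_t SpeedupBonusPercent = 200;       // -speedup-bonus-percent
  const int64_t MinSpeedupForBonus = 10;         // -min-speedup-for-bonus

  // Hypothetical callee: 3 instructions survive inlining and 10 fold away
  // once the arguments are known, all on blocks with frequency 8; the call
  // overhead (one instruction plus the call penalty, weighted by an entry
  // frequency of 8) also disappears.
  int64_t WeightedCost = saturatingMultiply(8, 3 * InstrCost);
  int64_t WeightedSavings = saturatingMultiply(8, 10 * InstrCost) +
                            saturatingMultiply(8, InstrCost + CallPenalty);

  // hasLargeSpeedup(): savings as a percentage of the weighted cost of the
  // un-inlined callee (cost remaining after inlining plus what inlining saved).
  int64_t D = std::max<int64_t>(1, WeightedCost + WeightedSavings);
  int64_t Speedup = WeightedSavings * 100 / D;

  // getSpeedupBonus(): scale the unbonused threshold by the bonus percent.
  int64_t Threshold = 10; // e.g. -inline-threshold=10 as in the tests
  int64_t Bonus =
      Speedup > MinSpeedupForBonus ? SpeedupBonusPercent * Threshold / 100 : 0;
  std::cout << "speedup% = " << Speedup << ", bonus = " << Bonus << '\n';
  // Prints: speedup% = 84, bonus = 20.
}

With these made-up frequencies the estimated speedup of 84% clears the 10% floor, so a threshold of 10 grows by 20; this is the mechanism that lets the over-threshold callees in the two tests above be inlined.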