Index: include/llvm/Analysis/InlineCost.h
===================================================================
--- include/llvm/Analysis/InlineCost.h
+++ include/llvm/Analysis/InlineCost.h
@@ -21,6 +21,7 @@
 
 namespace llvm {
 class AssumptionCacheTracker;
+class BlockFrequencyInfo;
 class CallSite;
 class DataLayout;
 class Function;
@@ -137,6 +138,9 @@
 
   /// Threshold to use when the callsite is considered hot.
   Optional<int> HotCallSiteThreshold;
+
+  /// Threshold to use when the callsite is considered cold.
+  Optional<int> ColdCallSiteThreshold;
 };
 
 /// Generate the parameters to tune the inline cost analysis based only on the
@@ -171,6 +175,7 @@
 getInlineCost(CallSite CS, const InlineParams &Params,
               TargetTransformInfo &CalleeTTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+              std::function<BlockFrequencyInfo &(Function &)> *GetBFI,
               ProfileSummaryInfo *PSI);
 
 /// \brief Get an InlineCost with the callee explicitly specified.
@@ -182,6 +187,7 @@
 getInlineCost(CallSite CS, Function *Callee, const InlineParams &Params,
               TargetTransformInfo &CalleeTTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+              std::function<BlockFrequencyInfo &(Function &)> *GetBFI,
               ProfileSummaryInfo *PSI);
 
 /// \brief Minimal filter to detect invalid constructs for inlining.
Index: include/llvm/Transforms/Utils/Cloning.h
===================================================================
--- include/llvm/Transforms/Utils/Cloning.h
+++ include/llvm/Transforms/Utils/Cloning.h
@@ -47,6 +47,7 @@
 class LoopInfo;
 class AllocaInst;
 class AssumptionCacheTracker;
+class BlockFrequencyInfo;
 class DominatorTree;
 
 /// Return an exact copy of the specified module
@@ -80,6 +81,9 @@
   /// originally inserted callsites were DCE'ed after they were cloned.
   std::vector<WeakVH> OperandBundleCallSites;
 
+  /// Map from original block to new blocks updated by the cloner.
+  std::unique_ptr<DenseMap<const BasicBlock *, BasicBlock *>> ClonedBlocks;
+
   ClonedCodeInfo() : ContainsCalls(false), ContainsDynamicAllocas(false) {}
 };
 
@@ -176,15 +180,18 @@
 /// InlineFunction call, and records the auxiliary results produced by it.
 class InlineFunctionInfo {
 public:
-  explicit InlineFunctionInfo(CallGraph *cg = nullptr,
-                              std::function<AssumptionCache &(Function &)>
-                                  *GetAssumptionCache = nullptr)
-      : CG(cg), GetAssumptionCache(GetAssumptionCache) {}
+  explicit InlineFunctionInfo(
+      CallGraph *cg = nullptr,
+      std::function<AssumptionCache &(Function &)> *GetAssumptionCache =
+          nullptr,
+      std::function<BlockFrequencyInfo &(Function &)> *GetBFI = nullptr)
+      : CG(cg), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI) {}
 
   /// CG - If non-null, InlineFunction will update the callgraph to reflect the
   /// changes it makes.
   CallGraph *CG;
   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
+  std::function<BlockFrequencyInfo &(Function &)> *GetBFI;
 
   /// StaticAllocas - InlineFunction fills this in with all static allocas that
   /// get copied into the caller.
Index: lib/Analysis/InlineCost.cpp
===================================================================
--- lib/Analysis/InlineCost.cpp
+++ lib/Analysis/InlineCost.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
@@ -48,6 +49,11 @@
     "inlinehint-threshold", cl::Hidden, cl::init(325),
     cl::desc("Threshold for inlining functions with inline hint"));
 
+static cl::opt<int>
+    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
+                          cl::init(45),
+                          cl::desc("Threshold for inlining cold callsites"));
+
 // We introduce this threshold to help performance of instrumentation based
 // PGO before we actually hook up inliner with analysis passes such as BPI and
 // BFI.
@@ -72,6 +78,9 @@
   /// Getter for the cache of @llvm.assume intrinsics.
   std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
 
+  /// Getter for BlockFrequencyInfo
+  std::function<BlockFrequencyInfo &(Function &)> *GetBFI;
+
   /// Profile summary information.
   ProfileSummaryInfo *PSI;
 
@@ -85,7 +94,6 @@
 
   /// Tunable parameters that control the analysis.
   const InlineParams &Params;
-
   int Threshold;
   int Cost;
 
@@ -202,19 +210,21 @@
 public:
   CallAnalyzer(const TargetTransformInfo &TTI,
                std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+               std::function<BlockFrequencyInfo &(Function &)> *GetBFI,
                ProfileSummaryInfo *PSI, Function &Callee, CallSite CSArg,
                const InlineParams &Params)
-      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), PSI(PSI), F(Callee),
-        CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
-        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
-        ExposesReturnsTwice(false), HasDynamicAlloca(false),
-        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
-        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
-        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
-        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
-        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
-        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
-        SROACostSavings(0), SROACostSavingsLost(0) {}
+      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
+        PSI(PSI), F(Callee), CandidateCS(CSArg), Params(Params),
+        Threshold(Params.DefaultThreshold), Cost(0), IsCallerRecursive(false),
+        IsRecursiveCall(false), ExposesReturnsTwice(false),
+        HasDynamicAlloca(false), ContainsNoDuplicateCall(false),
+        HasReturn(false), HasIndirectBr(false), HasFrameEscape(false),
+        AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
+        FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
+        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
+        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
+        NumInstructionsSimplified(0), SROACostSavings(0),
+        SROACostSavingsLost(0) {}
 
   bool analyzeCall(CallSite CS);
 
@@ -642,15 +652,18 @@
       PSI->isHotCount(TotalWeight)) {
     HotCallsite = true;
   }
+  bool UseCallsiteHotness = GetBFI != nullptr;
 
   // Listen to the inlinehint attribute or profile based hotness information
   // when it would increase the threshold and the caller does not need to
   // minimize its size.
-  bool InlineHint = Callee.hasFnAttribute(Attribute::InlineHint) ||
-                    (PSI && PSI->isFunctionEntryHot(&Callee));
+  bool InlineHint =
+      Callee.hasFnAttribute(Attribute::InlineHint) ||
+      (!UseCallsiteHotness && PSI && PSI->isFunctionEntryHot(&Callee));
   if (InlineHint && !Caller->optForMinSize())
     Threshold = MaxIfValid(Threshold, Params.HintThreshold);
 
+  BasicBlock *CallBB = CS.getInstruction()->getParent();
   if (HotCallsite && !Caller->optForMinSize())
     Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold);
 
@@ -659,6 +672,19 @@
   // the threshold.
   if (ColdCallee)
     Threshold = MinIfValid(Threshold, Params.ColdThreshold);
+  if (UseCallsiteHotness && PSI && CallBB) {
+    auto &CallerBFI = (*GetBFI)(*Caller);
+    auto CallsiteCount = CallerBFI.getBlockProfileCount(CallBB);
+    if (CallsiteCount) {
+      DEBUG(llvm::dbgs()
+            << "     Callsite count =  " << CallsiteCount.getValue()
+            << (PSI->isHotCount(CallsiteCount.getValue()) ? " (HOT)\n" : "\n"));
+      if (PSI->isHotCount(CallsiteCount.getValue()))
+        Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold);
+      else if (PSI->isColdCount(CallsiteCount.getValue()))
+        Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
+    }
+  }
 
   // Finally, take the target-specific inlining threshold multiplier into
   // account.
@@ -962,7 +988,8 @@
   // out. Pretend to inline the function, with a custom threshold.
   auto IndirectCallParams = Params;
   IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
-  CallAnalyzer CA(TTI, GetAssumptionCache, PSI, *F, CS, IndirectCallParams);
+  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, *F, CS,
+                  IndirectCallParams);
   if (CA.analyzeCall(CS)) {
     // We were able to inline the indirect call! Subtract the cost from the
     // threshold to get the bonus we want to apply, but don't go below zero.
@@ -1452,15 +1479,17 @@
 InlineCost llvm::getInlineCost(
     CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+    std::function<BlockFrequencyInfo &(Function &)> *GetBFI,
     ProfileSummaryInfo *PSI) {
   return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
-                       GetAssumptionCache, PSI);
+                       GetAssumptionCache, GetBFI, PSI);
 }
 
 InlineCost llvm::getInlineCost(
     CallSite CS, Function *Callee, const InlineParams &Params,
     TargetTransformInfo &CalleeTTI,
     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+    std::function<BlockFrequencyInfo &(Function &)> *GetBFI,
     ProfileSummaryInfo *PSI) {
 
   // Cannot inline indirect calls.
@@ -1495,7 +1524,8 @@
   DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
                      << "...\n");
 
-  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, PSI, *Callee, CS, Params);
+  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, *Callee, CS,
+                  Params);
   bool ShouldInline = CA.analyzeCall(CS);
 
   DEBUG(CA.dump());
@@ -1568,6 +1598,9 @@
   // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
   Params.HotCallSiteThreshold = HotCallSiteThreshold;
 
+  // Set the ColdCallSiteThreshold knob from the -cold-callsite-threshold.
+  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
+
   // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
   // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
   // -inlinehint-threshold commandline option is not explicitly given. If that
Index: lib/Transforms/IPO/InlineSimple.cpp
===================================================================
--- lib/Transforms/IPO/InlineSimple.cpp
+++ lib/Transforms/IPO/InlineSimple.cpp
@@ -61,7 +61,8 @@
         [&](Function &F) -> AssumptionCache & {
       return ACT->getAssumptionCache(F);
     };
-    return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, PSI);
+    return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache,
+                               /*GetBFI=*/nullptr, PSI);
   }
 
   bool runOnSCC(CallGraphSCC &SCC) override;
Index: lib/Transforms/IPO/Inliner.cpp
===================================================================
--- lib/Transforms/IPO/Inliner.cpp
+++ lib/Transforms/IPO/Inliner.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
@@ -766,14 +767,20 @@
     return FAM.getResult<AssumptionAnalysis>(F);
   };
 
+  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
+      [&](Function &F) -> BlockFrequencyInfo & {
+    return FAM.getResult<BlockFrequencyAnalysis>(F);
+  };
+
   // Setup the data structure used to plumb customization into the
   // `InlineFunction` routine.
-  InlineFunctionInfo IFI(/*cg=*/nullptr, &GetAssumptionCache);
+  InlineFunctionInfo IFI(/*cg=*/nullptr, &GetAssumptionCache, &GetBFI);
 
   auto GetInlineCost = [&](CallSite CS) {
     Function &Callee = *CS.getCalledFunction();
     auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee);
-    return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, PSI);
+    return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, &GetBFI,
+                         PSI);
   };
 
   // We use a worklist of nodes to process so that we can handle if the SCC
Index: lib/Transforms/Utils/CloneFunction.cpp
===================================================================
--- lib/Transforms/Utils/CloneFunction.cpp
+++ lib/Transforms/Utils/CloneFunction.cpp
@@ -255,6 +255,8 @@
   // Nope, clone it now.
   BasicBlock *NewBB;
   BBEntry = NewBB = BasicBlock::Create(BB->getContext());
+  if (CodeInfo->ClonedBlocks)
+    (*CodeInfo->ClonedBlocks)[BB] = NewBB;
   if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
 
   // It is only legal to clone a function if a block address within that
Index: lib/Transforms/Utils/InlineFunction.cpp
===================================================================
--- lib/Transforms/Utils/InlineFunction.cpp
+++ lib/Transforms/Utils/InlineFunction.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/EHPersonalities.h"
@@ -1394,6 +1395,60 @@
   }
 }
 
+/// Update the block frequencies of the caller after a callee has been inlined.
+///
+/// Each block cloned into the caller has its block frequency scaled by the
+/// ratio of CallSiteFreq/CalleeEntryFreq. This ensures that the cloned copy of
+/// callee's entry block gets the same freqquency as the callsite block and the
+/// relative frequencies of all cloned blocks remain the same after cloning.
+static void
+updateCallerBFI(BasicBlock *CallSiteBlock,
+                DenseMap<const BasicBlock *, BasicBlock *> *ClonedBlocks,
+                InlineFunctionInfo &IFI, BlockFrequencyInfo &CallerBFI,
+                BlockFrequencyInfo &CalleeBFI) {
+  if (!ClonedBlocks)
+    return;
+
+  for (auto &Entry : *ClonedBlocks) {
+    // Use 128 bits APInt to avoid overflow.
+    APInt CallSiteFreq(128,
+                       CallerBFI.getBlockFreq(CallSiteBlock).getFrequency());
+    APInt CalleeEntryFreq(128, CalleeBFI.getEntryFreq());
+    APInt BBFreq(128, CalleeBFI.getBlockFreq(Entry.first).getFrequency());
+
+    // Multiply first by CallSiteFreq and then divide by CalleeEntryFreq
+    // to minimize loss of precision.
+    BBFreq *= CallSiteFreq;
+    BBFreq = BBFreq.udiv(CalleeEntryFreq);
+
+    CallerBFI.setBlockFreq(Entry.second, BBFreq.getLimitedValue());
+  }
+}
+
+/// Update the entry count of callee after inlining.
+///
+/// The callsite's block count is subtracted from the callee's function entry
+/// count.
+static void updateCalleeCount(BlockFrequencyInfo &CallerBFI, BasicBlock *CallBB,
+                              Function *Callee) {
+  // If the callee has a original count of N, and the estimated count of
+  // callsite is M, the new callee count is set to N - M. M is estimated from
+  // the caller's entry count, its entry block frequency and the block frequency
+  // of the callsite.
+  Optional<uint64_t> CalleeCount = Callee->getEntryCount();
+  if (!CalleeCount)
+    return;
+  Optional<uint64_t> CallSiteCount = CallerBFI.getBlockProfileCount(CallBB);
+  if (!CallSiteCount)
+    return;
+  // Since CallSiteCount is an estimate, it could exceed the original callee
+  // count and has to be set to 0.
+  if (CallSiteCount.getValue() > CalleeCount.getValue())
+    Callee->setEntryCount(0);
+  else
+    Callee->setEntryCount(CalleeCount.getValue() - CallSiteCount.getValue());
+}
+
 /// This function inlines the called function into the basic block of the
 /// caller. This returns false if it is not possible to inline this call.
 /// The program is still in a well defined state if this occurs though.
@@ -1410,8 +1465,8 @@
 
   // If IFI has any state in it, zap it before we fill it in.
   IFI.reset();
-  
-  const Function *CalledFunc = CS.getCalledFunction();
+
+  Function *CalledFunc = CS.getCalledFunction();
   if (!CalledFunc ||              // Can't inline external function or indirect
       CalledFunc->isDeclaration() || // call, or call to a vararg function!
       CalledFunc->getFunctionType()->isVarArg()) return false;
@@ -1532,6 +1587,8 @@
   // function.
   SmallVector<ReturnInst*, 8> Returns;
   ClonedCodeInfo InlinedFunctionInfo;
+  InlinedFunctionInfo.ClonedBlocks.reset(
+      new DenseMap<const BasicBlock *, BasicBlock *>());
   Function::iterator FirstNewBlock;
 
   { // Scope to destroy VMap after cloning.
@@ -1578,6 +1635,15 @@
     CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
                               /*ModuleLevelChanges=*/false, Returns, ".i",
                               &InlinedFunctionInfo, TheCall);
+    if (IFI.GetBFI) {
+      auto &CallerBFI = (*IFI.GetBFI)(*Caller);
+      auto &CalleeBFI = (*IFI.GetBFI)(*CalledFunc);
+      // Update the block frequencies of the cloned blocks in the caller
+      updateCallerBFI(OrigBB, InlinedFunctionInfo.ClonedBlocks.get(), IFI,
+                      CallerBFI, CalleeBFI);
+      // Update the profile count of callee.
+      updateCalleeCount(CallerBFI, OrigBB, CalledFunc);
+    }
 
     // Remember the first block that is newly cloned over.
     FirstNewBlock = LastBlock; ++FirstNewBlock;
@@ -2182,6 +2248,12 @@
   // unreachable, delete it.  It can only contain a bitcast and ret.
   if (InlinedMustTailCalls && pred_begin(AfterCallBB) == pred_end(AfterCallBB))
     AfterCallBB->eraseFromParent();
+  else if (IFI.GetBFI) {
+    auto &CallerBFI = (*IFI.GetBFI)(*Caller);
+    // Copy original BB's block frequency to AfterCallBB
+    CallerBFI.setBlockFreq(AfterCallBB,
+                           CallerBFI.getBlockFreq(OrigBB).getFrequency());
+  }
 
   // We should always be able to fold the entry block of the function into the
   // single predecessor of the block...
Index: test/Transforms/Inline/function-count-update-2.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/function-count-update-2.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -S | FileCheck %s
+
+; This tests that the function count of a callee gets correctly updated after it
+; has been inlined into a two callsites.
+
+; CHECK: @callee() !prof [[COUNT:![0-9]+]]
+define i32 @callee() !prof !1 {
+  ret i32 0
+}
+
+define i32 @caller1() !prof !2 {
+  %i = call i32 @callee()
+  ret i32 %i
+}
+
+define i32 @caller2() !prof !3 {
+  %i = call i32 @callee()
+  ret i32 %i
+}
+
+!llvm.module.flags = !{!0}
+; CHECK: [[COUNT]] = !{!"function_entry_count", i64 0}
+!0 = !{i32 1, !"MaxFunctionCount", i32 1000}
+!1 = !{!"function_entry_count", i64 1000}
+!2 = !{!"function_entry_count", i64 600}
+!3 = !{!"function_entry_count", i64 400}
+
Index: test/Transforms/Inline/function-count-update-3.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/function-count-update-3.ll
@@ -0,0 +1,74 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -S -inline-threshold=50 | FileCheck %s
+
+; This tests that the function count of a function gets properly scaled after 
+; inlining a call chain leading to the function.
+; Function a calls c with count 200 (C1)
+; Function b calls c with count 300
+; Function c calls e with count 250 (C2)
+; Entry count of e is 500 (C3)
+; c->e inlining does not happen since the cost exceeds threshold.
+; c then inlined into a.
+; e now gets inlined into a (through c) since the branch condition in e is now
+; known and hence the cost gets reduced.
+; Estimated count of a->e callsite = C2 * (C1 / C3)
+; Estimated count of a->e callsite = 250 * (200 / 500) = 100
+; Remaining count of e = C3 - 100 = 500 - 100 = 400
+
+@data = external global i32
+
+define i32 @a(i32 %a1) !prof !1 {
+  %a2 = call i32 @c(i32 %a1, i32 1)
+  ret i32 %a2
+}
+
+define i32 @b(i32 %b1) !prof !2 {
+  %b2 = call i32 @c(i32 %b1, i32 %b1)
+  ret i32 %b2
+}
+
+declare void @ext();
+
+define i32 @c(i32 %c1, i32 %c100) !prof !3 {
+  call void @ext()
+  %cond = icmp sle i32 %c1, 1
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_false:
+  ret i32 0
+
+cond_true:
+  %c11 = call i32 @e(i32 %c100)
+  ret i32 %c11
+}
+
+
+; CHECK: @e(i32 %c1) !prof [[COUNT:![0-9]+]]
+define i32 @e(i32 %c1) !prof !4 {
+  %cond = icmp sle i32 %c1, 1
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_false:
+  call void @ext()
+  %c2 = load i32, i32* @data, align 4
+  %c3 = add i32 %c1, %c2
+  %c4 = mul i32 %c3, %c2
+  %c5 = add i32 %c4, %c2
+  %c6 = mul i32 %c5, %c2
+  %c7 = add i32 %c6, %c2
+  %c8 = mul i32 %c7, %c2
+  %c9 = add i32 %c8, %c2
+  %c10 = mul i32 %c9, %c2
+  ret i32 %c10
+
+cond_true:
+  ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+; CHECK: [[COUNT]] = !{!"function_entry_count", i64 400}
+!0 = !{i32 1, !"MaxFunctionCount", i32 5000}
+!1 = !{!"function_entry_count", i64 200}
+!2 = !{!"function_entry_count", i64 300}
+!3 = !{!"function_entry_count", i64 500}
+!4 = !{!"function_entry_count", i64 500}
+
Index: test/Transforms/Inline/function-count-update.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/function-count-update.ll
@@ -0,0 +1,50 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -S | FileCheck %s
+
+; This tests that the function count of two callees get correctly updated after
+; they have been inlined into two back-to-back callsites in a single basic block
+; in the caller. The callees have the alwaysinline attribute and so they get
+; inlined both with the regular inliner pass and the always inline pass. In
+; both cases, the new count of each callee is the original count minus callsite
+; count which is 200 (since the caller's entry count is 400 and the block
+; containing the calls have a relative block frequency of 0.5).
+
+; CHECK: @callee1(i32 %n) #0 !prof [[COUNT1:![0-9]+]]
+define i32 @callee1(i32 %n) #0 !prof !1 {
+  %cond = icmp sle i32 %n, 10
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_true:
+  %r1 = add i32 %n, 1
+  ret i32 %r1
+cond_false:
+  %r2 = add i32 %n, 2
+  ret i32 %r2
+}
+
+; CHECK: @callee2(i32 %n) #0 !prof [[COUNT2:![0-9]+]]
+define i32 @callee2(i32 %n) #0 !prof !2 {
+  %r1 = add i32 %n, 1
+  ret i32 %r1
+}
+
+define i32 @caller(i32 %n) !prof !3 {
+  %cond = icmp sle i32 %n, 100
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_true:
+  %i = call i32 @callee1(i32 %n)
+  %j = call i32 @callee2(i32 %i)
+  ret i32 %j
+cond_false:
+  ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+; CHECK: [[COUNT1]] = !{!"function_entry_count", i64 800}
+; CHECK: [[COUNT2]] = !{!"function_entry_count", i64 1800}
+!0 = !{i32 1, !"MaxFunctionCount", i32 1000}
+!1 = !{!"function_entry_count", i64 1000}
+!2 = !{!"function_entry_count", i64 2000}
+!3 = !{!"function_entry_count", i64 400}
+attributes #0 = { alwaysinline }
+
Index: test/Transforms/Inline/inline-cold-callsite.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/inline-cold-callsite.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -inline-threshold=100 -inline-cold-callsite-threshold=0 -S | FileCheck %s
+
+; This tests that a cold callsite gets the inline-cold-callsite-threshold
+; and does not get inlined. Another callsite to an identical callee that
+; is not cold gets inlined because cost is below the inline-threshold.
+
+define i32 @callee1(i32 %x) !prof !21 {
+  %x1 = add i32 %x, 1
+  %x2 = add i32 %x1, 1
+  %x3 = add i32 %x2, 1
+  call void @extern()
+  ret i32 %x3
+}
+
+define i32 @callee2(i32 %x) !prof !21 {
+; CHECK-LABEL: @callee2(
+  %x1 = add i32 %x, 1
+  %x2 = add i32 %x1, 1
+  %x3 = add i32 %x2, 1
+  call void @extern()
+  ret i32 %x3
+}
+
+define i32 @caller(i32 %n) !prof !22 {
+; CHECK-LABEL: @caller(
+  %cond = icmp sle i32 %n, 100
+  br i1 %cond, label %cond_true, label %cond_false, !prof !0
+
+cond_true:
+; CHECK-NOT: call i32 @callee1
+; CHECK: ret i32 %x3.i
+  %i = call i32 @callee1(i32 %n)
+  ret i32 %i
+cond_false:
+; CHECK: call i32 @callee2
+; CHECK: ret i32 %j
+  %j = call i32 @callee2(i32 %n)
+  ret i32 %j
+}
+declare void @extern()
+
+!0 = !{!"branch_weights", i32 200, i32 1}
+
+!llvm.module.flags = !{!1}
+!21 = !{!"function_entry_count", i64 200}
+!22 = !{!"function_entry_count", i64 200}
+
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 1000}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 1000}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 1000, i32 1}
+!13 = !{i32 999000, i64 1000, i32 1}
+!14 = !{i32 999999, i64 1, i32 2}
Index: test/Transforms/Inline/inline-hot-callee.ll
===================================================================
--- test/Transforms/Inline/inline-hot-callee.ll
+++ test/Transforms/Inline/inline-hot-callee.ll
@@ -1,5 +1,4 @@
 ; RUN: opt < %s -inline -inline-threshold=0 -inlinehint-threshold=100 -S | FileCheck %s
-; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -inline-threshold=0 -inlinehint-threshold=100 -S | FileCheck %s
 
 ; This tests that a hot callee gets the (higher) inlinehint-threshold even without
 ; inline hints and gets inlined because the cost is less than inlinehint-threshold.
Index: test/Transforms/Inline/inline-hot-callsite-2.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/inline-hot-callsite-2.ll
@@ -0,0 +1,62 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -inline-threshold=0 -hot-callsite-threshold=100 -S | FileCheck %s
+
+; This tests that a callsite which is determined to be hot based on the caller's
+; entry count and the callsite block frequency gets the hot-callsite-threshold.
+; Another callsite to an identical callee that is not hot does not get inlined
+; because cost exceeds the inline-threshold
+
+define i32 @callee1(i32 %x) !prof !21 {
+  %x1 = add i32 %x, 1
+  %x2 = add i32 %x1, 1
+  %x3 = add i32 %x2, 1
+  call void @extern()
+  ret i32 %x3
+}
+
+define i32 @callee2(i32 %x) !prof !21 {
+; CHECK-LABEL: @callee2(
+  %x1 = add i32 %x, 1
+  %x2 = add i32 %x1, 1
+  %x3 = add i32 %x2, 1
+  call void @extern()
+  ret i32 %x3
+}
+
+define i32 @caller(i32 %n) !prof !22 {
+; CHECK-LABEL: @caller(
+  %cond = icmp sle i32 %n, 100
+  br i1 %cond, label %cond_true, label %cond_false, !prof !0
+
+cond_true:
+; CHECK-NOT: call i32 @callee1
+; CHECK: ret i32 %x3.i
+  %i = call i32 @callee1(i32 %n)
+  ret i32 %i
+cond_false:
+; CHECK: call i32 @callee2
+; CHECK: ret i32 %j
+  %j = call i32 @callee2(i32 %n)
+  ret i32 %j
+}
+declare void @extern()
+
+!0 = !{!"branch_weights", i32 64, i32 4}
+
+!llvm.module.flags = !{!1}
+!21 = !{!"function_entry_count", i64 200}
+!22 = !{!"function_entry_count", i64 200}
+
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 1000}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 1000}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 100, i32 1}
+!13 = !{i32 999000, i64 100, i32 1}
+!14 = !{i32 999999, i64 1, i32 2}