Index: include/llvm/Transforms/Utils/CodeExtractor.h
===================================================================
--- include/llvm/Transforms/Utils/CodeExtractor.h
+++ include/llvm/Transforms/Utils/CodeExtractor.h
@@ -86,8 +86,10 @@
     /// \brief Perform the extraction, returning the new function.
     ///
     /// Returns zero when called on a CodeExtractor instance where isEligible
-    /// returns false.
-    Function *extractCodeRegion();
+    /// returns false. If codeReplacer is not null, the pointer to the basic
+    /// block in which the call sequence to the extracted function will be
+    /// stored in *codeReplacer.
+    Function *extractCodeRegion(BasicBlock **codeReplacer = nullptr);
 
     /// \brief Test whether this code extractor is eligible.
     ///
Index: lib/Transforms/IPO/PartialInlining.cpp
===================================================================
--- lib/Transforms/IPO/PartialInlining.cpp
+++ lib/Transforms/IPO/PartialInlining.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
@@ -42,6 +43,11 @@
 static cl::opt<bool>
     DisablePartialInlining("disable-partial-inlining", cl::init(false),
                            cl::Hidden, cl::desc("Disable partial ininling"));
+// This is an option used by testing:
+static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
+                                      cl::init(false), cl::ZeroOrMore,
+                                      cl::Hidden,
+                                      cl::desc("Skip Cost Analysis"));
 
 static cl::opt<unsigned> MaxNumInlineBlocks(
     "max-num-inline-blocks", cl::init(5), cl::Hidden,
@@ -84,8 +90,6 @@
   bool run(Module &M);
   Function *unswitchFunction(Function *F);
 
-  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
-
 private:
   int NumPartialInlining = 0;
   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
@@ -93,11 +97,54 @@
   Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
   ProfileSummaryInfo *PSI;
 
-  bool shouldPartialInline(CallSite CS, OptimizationRemarkEmitter &ORE);
+  bool shouldPartialInline(CallSite CS, BlockFrequencyInfo *CalleeBFI,
+                           BasicBlock *OutliningCallBB,
+                           int OutliningCallOverhead,
+                           OptimizationRemarkEmitter &ORE);
   bool IsLimitReached() {
     return (MaxNumPartialInlining != -1 &&
             NumPartialInlining >= MaxNumPartialInlining);
   }
+  CallSite getCallSite(User *U) {
+    CallSite CS;
+    if (CallInst *CI = dyn_cast<CallInst>(U))
+      CS = CallSite(CI);
+    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
+      CS = CallSite(II);
+    else
+      llvm_unreachable("All uses must be calls");
+    return CS;
+  }
+  CallSite getOneCallSiteTo(Function *F) {
+    User *User = *F->user_begin();
+    return getCallSite(User);
+  }
+  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
+    CallSite CS = getOneCallSiteTo(F);
+
+    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+    BasicBlock *Block = CS.getParent();
+    return std::make_tuple(DLoc, Block);
+  }
+
+  // Returns the costs associated with function outlining:
+  // - The first value is the non-weighted runtime cost for making the call
+  //   to the outlined function 'OutlinedFunction', including the addtional
+  //   setup cost in the outlined function itself;
+  // - The second value is the estimated size of the new call sequence in
+  //   basic block 'OutliningCallBB';
+  // - The third value is the estimated size of the original code from
+  //   function 'F' that is extracted into the outlined function.
+  std::tuple<int, int, int>
+  computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
+                        Function *OutlinedFunction,
+                        BasicBlock *OutliningCallBB);
+  // Compute the 'Inline' of block BB. InlineCost is a proxy usedto approximate
+  // both the size and runtime cost (Note that in the current inline cost
+  // analysis, there is no clear distinction there either).
+  int computeBBInlineCost(BasicBlock *BB);
+
+  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
 };
 
 struct PartialInlinerLegacyPass : public ModulePass {
@@ -290,9 +337,15 @@
 }
 
 bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
+                                             BlockFrequencyInfo *CalleeBFI,
+                                             BasicBlock *OutliningCallBB,
+                                             int NonWeightedOutliningRcost,
                                              OptimizationRemarkEmitter &ORE) {
   // TODO : more sharing with shouldInline in Inliner.cpp
   using namespace ore;
+  if (SkipCostAnalysis)
+    return true;
+
   Instruction *Call = CS.getInstruction();
   Function *Callee = CS.getCalledFunction();
   Function *Caller = CS.getCaller();
@@ -316,7 +369,7 @@
   }
 
   if (!IC) {
-    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
+    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
              << NV("Callee", Callee) << " not partially inlined into "
              << NV("Caller", Caller) << " because too costly to inline (cost="
              << NV("Cost", IC.getCost()) << ", threshold="
@@ -324,6 +377,34 @@
     return false;
   }
 
+  const DataLayout &DL = Caller->getParent()->getDataLayout();
+  // The savings of eliminating the call:
+  int NonWeightedSavings = getCallsiteCost(CS, DL);
+
+  BlockFrequency CalleeEntryFreq =
+      CalleeBFI->getBlockFreq(&Callee->getEntryBlock());
+  BlockFrequency OutliningCallFreq = CalleeBFI->getBlockFreq(OutliningCallBB);
+
+  BranchProbability RelativeFreq = BranchProbability::getBranchProbability(
+      OutliningCallFreq.getFrequency(), CalleeEntryFreq.getFrequency());
+  BlockFrequency NormWeightedSavings(NonWeightedSavings);
+  BlockFrequency NormWeightedRcost =
+      BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;
+
+  // Weighted saving is smaller than weighted cost, return false
+  if (NormWeightedSavings < NormWeightedRcost) {
+    ORE.emit(
+        OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
+        << NV("Callee", Callee) << " not partially inlined into "
+        << NV("Caller", Caller) << " runtime overhead (overhead="
+        << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
+        << ", savings="
+        << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
+        << " of making the outlined call is too high");
+
+    return false;
+  }
+
   ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
            << NV("Callee", Callee) << " can be partially inlined into "
            << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
@@ -332,6 +413,65 @@
   return true;
 }
 
+// TODO: Ideally  we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to esimate the size cost as well as runtime cost of the BB.
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
+  int InlineCost = 0;
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    if (dyn_cast<CallInst>(I)) {
+      InlineCost += InlineConstants::CallPenalty;
+      continue;
+    }
+    if (dyn_cast<InvokeInst>(I)) {
+      InlineCost += InlineConstants::CallPenalty;
+      continue;
+    }
+
+    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+      InlineCost += (SI->getNumCases() + 1);
+      continue;
+    }
+    InlineCost += InlineConstants::InstrCost;
+  }
+  return InlineCost;
+}
+
+std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
+    Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
+    BasicBlock *OutliningCallBB) {
+  // First compute the cost of the outlined region 'OI' in the original
+  // function 'F':
+  int OutlinedRegionCost = 0;
+  for (BasicBlock &BB : *F) {
+    if (&BB != OI->ReturnBlock &&
+        // Assuming Entry set is small -- do a linear search here.
+        std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
+            OI->Entries.end()) {
+      OutlinedRegionCost += computeBBInlineCost(&BB);
+    }
+  }
+
+  // Now compute the cost of the call sequence to the outlined function
+  // 'OutlinedFunction' in BB 'OutliningCallBB':
+  int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);
+
+  // Now compute the cost of the extracted/outlined function itself:
+  int OutlinedFunctionCost = 0;
+  for (BasicBlock &BB : *OutlinedFunction) {
+    OutlinedFunctionCost += computeBBInlineCost(&BB);
+  }
+
+  int OutliningRuntimeOverhead =
+      OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
+
+  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
+                         OutlinedRegionCost);
+}
+
 Function *PartialInlinerImpl::unswitchFunction(Function *F) {
 
   if (F->hasAddressTaken())
@@ -347,6 +487,9 @@
   if (PSI->isFunctionEntryCold(F))
     return nullptr;
 
+  if (F->user_begin() == F->user_end())
+    return nullptr;
+
   std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
       computeOutliningInfo(F);
 
@@ -443,28 +586,53 @@
   BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);
 
   // Extract the body of the if.
+  BasicBlock *codeReplacer = 0;
   Function *ExtractedFunction =
       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
-          .extractCodeRegion();
+          .extractCodeRegion(&codeReplacer);
+
+  int NonWeightedRcost;
+  int SizeCost;
+  int OutlinedRegionSizeCost;
+
+  std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
+      computeOutliningCosts(F, OutliningInfo.get(), ExtractedFunction,
+                            codeReplacer);
+
+  // The call sequence to the outlined function is larger than the original
+  // outlined region size, it does not increase the chances of inlining
+  // 'F' with outlining (The inliner using the size increase to model the
+  // the cost of inlining a callee).
+  if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
+    OptimizationRemarkEmitter ORE(F);
+    DebugLoc DLoc;
+    BasicBlock *Block;
+    std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
+    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+                                        DLoc, Block)
+             << ore::NV("Function", F)
+             << " not partially inlined into callers (Original Size = "
+             << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
+             << ", Size of call sequence to outlined function = "
+             << ore::NV("NewSize", SizeCost) << ")");
+    DuplicateFunction->replaceAllUsesWith(F);
+    DuplicateFunction->eraseFromParent();
+    ExtractedFunction->eraseFromParent();
+    return nullptr;
+  }
 
   // Inline the top-level if test into all callers.
   std::vector<User *> Users(DuplicateFunction->user_begin(),
                             DuplicateFunction->user_end());
 
   for (User *User : Users) {
-    CallSite CS;
-    if (CallInst *CI = dyn_cast<CallInst>(User))
-      CS = CallSite(CI);
-    else if (InvokeInst *II = dyn_cast<InvokeInst>(User))
-      CS = CallSite(II);
-    else
-      llvm_unreachable("All uses must be calls");
+    CallSite CS = getCallSite(User);
 
     if (IsLimitReached())
       continue;
 
     OptimizationRemarkEmitter ORE(CS.getCaller());
-    if (!shouldPartialInline(CS, ORE))
+    if (!shouldPartialInline(CS, &BFI, codeReplacer, NonWeightedRcost, ORE))
       continue;
 
     DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
@@ -485,7 +653,6 @@
   DuplicateFunction->replaceAllUsesWith(F);
   DuplicateFunction->eraseFromParent();
 
-
   return ExtractedFunction;
 }
 
Index: lib/Transforms/Utils/CodeExtractor.cpp
===================================================================
--- lib/Transforms/Utils/CodeExtractor.cpp
+++ lib/Transforms/Utils/CodeExtractor.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
@@ -714,7 +715,7 @@
       MDBuilder(TI->getContext()).createBranchWeights(BranchWeights));
 }
 
-Function *CodeExtractor::extractCodeRegion() {
+Function *CodeExtractor::extractCodeRegion(BasicBlock **codeReplacerP) {
   if (!isEligible())
     return nullptr;
 
@@ -750,6 +751,8 @@
   BasicBlock *codeReplacer = BasicBlock::Create(header->getContext(), 
                                                 "codeRepl", oldFunction,
                                                 header);
+  if (codeReplacerP)
+    *codeReplacerP = codeReplacer;
 
   // The new function needs a root node because other nodes can branch to the
   // head of the region, but the entry node of a function cannot have preds.
Index: test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
===================================================================
--- test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
+++ test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
 
 ; This test checks to make sure that the CodeExtractor
 ;  properly sets the entry count for the function that is
Index: test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
===================================================================
--- test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
+++ test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis -S | FileCheck %s
 
 ; This test checks to make sure that CodeExtractor updates
 ;  the exit branch probabilities for multiple exit blocks.
Index: test/Transforms/CodeExtractor/PartialInlineHighCost.ll
===================================================================
--- test/Transforms/CodeExtractor/PartialInlineHighCost.ll
+++ test/Transforms/CodeExtractor/PartialInlineHighCost.ll
@@ -0,0 +1,107 @@
+; The outlined region has high frequency  and the outlining
+; call sequence is expensive (input, output, multiple exit etc)
+; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck --check-prefix=NOCOST %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -max-num-inline-blocks=2 -S | FileCheck  --check-prefix=NOCOST %s
+
+
+; Function Attrs: nounwind
+define i32 @bar_hot_outline_region(i32 %arg) local_unnamed_addr #0 {
+bb:
+  %tmp = icmp slt i32 %arg, 0
+  br i1 %tmp, label %bb1, label %bb16, !prof !1
+
+bb1:                                              ; preds = %bb
+  %tmp2 = tail call i32 (...) @foo() #0
+  %tmp3 = tail call i32 (...) @foo() #0
+  %tmp4 = tail call i32 (...) @foo() #0
+  %tmp5 = tail call i32 (...) @foo() #0
+  %tmp6 = tail call i32 (...) @foo() #0
+  %tmp7 = tail call i32 (...) @foo() #0
+  %tmp8 = add nsw i32 %arg, 1
+  %tmp9 = tail call i32 @goo(i32 %tmp8) #0
+  %tmp10 = tail call i32 (...) @foo() #0
+  %tmp11 = icmp eq i32 %tmp10, 0
+  br i1 %tmp11, label %bb12, label %bb16
+
+bb12:                                             ; preds = %bb1
+  %tmp13 = tail call i32 (...) @foo() #0
+  %tmp14 = icmp eq i32 %tmp13, 0
+  %tmp15 = select i1 %tmp14, i32 0, i32 3
+  br label %bb16
+
+bb16:                                             ; preds = %bb12, %bb1, %bb
+  %tmp17 = phi i32 [ 2, %bb1 ], [ %tmp15, %bb12 ], [ 0, %bb ]
+  ret i32 %tmp17
+}
+
+define i32 @bar_cold_outline_region(i32 %arg) local_unnamed_addr #0 {
+bb:
+  %tmp = icmp slt i32 %arg, 0
+  br i1 %tmp, label %bb1, label %bb16, !prof !2
+
+bb1:                                              ; preds = %bb
+  %tmp2 = tail call i32 (...) @foo() #0
+  %tmp3 = tail call i32 (...) @foo() #0
+  %tmp4 = tail call i32 (...) @foo() #0
+  %tmp5 = tail call i32 (...) @foo() #0
+  %tmp6 = tail call i32 (...) @foo() #0
+  %tmp7 = tail call i32 (...) @foo() #0
+  %tmp8 = add nsw i32 %arg, 1
+  %tmp9 = tail call i32 @goo(i32 %tmp8) #0
+  %tmp10 = tail call i32 (...) @foo() #0
+  %tmp11 = icmp eq i32 %tmp10, 0
+  br i1 %tmp11, label %bb12, label %bb16
+
+bb12:                                             ; preds = %bb1
+  %tmp13 = tail call i32 (...) @foo() #0
+  %tmp14 = icmp eq i32 %tmp13, 0
+  %tmp15 = select i1 %tmp14, i32 0, i32 3
+  br label %bb16
+
+bb16:                                             ; preds = %bb12, %bb1, %bb
+  %tmp17 = phi i32 [ 2, %bb1 ], [ %tmp15, %bb12 ], [ 0, %bb ]
+  ret i32 %tmp17
+}
+
+; Function Attrs: nounwind
+declare i32 @foo(...) local_unnamed_addr #0
+
+; Function Attrs: nounwind
+declare i32 @goo(i32) local_unnamed_addr #0
+
+; Function Attrs: nounwind
+define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
+bb:
+; CHECK-LABEL: @dummy_caller
+; CHECK-NOT: br i1
+; CHECK-NOT: call{{.*}}bar_hot_outline_region. 
+; NOCOST-LABEL: @dummy_caller
+; NOCOST: br i1
+; NOCOST: call{{.*}}bar_hot_outline_region.
+
+  %tmp = tail call i32 @bar_hot_outline_region(i32 %arg)
+  ret i32 %tmp
+}
+
+define i32 @dummy_caller2(i32 %arg) local_unnamed_addr #0 {
+bb:
+; CHECK-LABEL: @dummy_caller2
+; CHECK: br i1
+; CHECK: call{{.*}}bar_cold_outline_region.
+; NOCOST-LABEL: @dummy_caller2
+; NOCOST: br i1
+; NOCOST: call{{.*}}bar_cold_outline_region.
+
+  %tmp = tail call i32 @bar_cold_outline_region(i32 %arg)
+  ret i32 %tmp
+}
+
+attributes #0 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 5.0.0 (trunk 301898)"}
+!1 = !{!"branch_weights", i32 2000, i32 1}
+!2 = !{!"branch_weights", i32 1, i32 100}
Index: test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
===================================================================
--- test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
+++ test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -partial-inliner -S | FileCheck %s
 ; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
-; RUN: opt < %s -partial-inliner -max-num-inline-blocks=3 -S | FileCheck --check-prefix=LIMIT3 %s
-; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=3 -S | FileCheck  --check-prefix=LIMIT3 %s
+; RUN: opt < %s -partial-inliner -max-num-inline-blocks=3 -skip-partial-inlining-cost-analysis  -S | FileCheck --check-prefix=LIMIT3 %s
+; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=3 -skip-partial-inlining-cost-analysis -S | FileCheck  --check-prefix=LIMIT3 %s
 ; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck --check-prefix=LIMIT2 %s
 ; RUN: opt < %s -passes=partial-inliner -max-num-inline-blocks=2 -S | FileCheck  --check-prefix=LIMIT2 %s
 
Index: test/Transforms/CodeExtractor/SingleCondition.ll
===================================================================
--- test/Transforms/CodeExtractor/SingleCondition.ll
+++ test/Transforms/CodeExtractor/SingleCondition.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -partial-inliner -S  | FileCheck %s
-; RUN: opt < %s -passes=partial-inliner -S  | FileCheck %s
+; RUN: opt < %s -skip-partial-inlining-cost-analysis -partial-inliner -S  | FileCheck %s
+; RUN: opt < %s -skip-partial-inlining-cost-analysis -passes=partial-inliner -S  | FileCheck %s
 
 define internal i32 @inlinedFunc(i1 %cond, i32* align 4 %align.val) {
 entry:
Index: test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
===================================================================
--- test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
+++ test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -partial-inliner | llc -filetype=null
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis | llc -filetype=null
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
 ; This testcase checks to see if CodeExtractor properly inherits
 ;   target specific attributes for the extracted function. This can
 ;   cause certain instructions that depend on the attributes to not