Index: include/llvm/Transforms/Utils/CodeExtractor.h
===================================================================
--- include/llvm/Transforms/Utils/CodeExtractor.h
+++ include/llvm/Transforms/Utils/CodeExtractor.h
@@ -83,6 +83,11 @@
                   BlockFrequencyInfo *BFI = nullptr,
                   BranchProbabilityInfo *BPI = nullptr);
 
+    /// Return the costs of outlining the extracted region.
+    /// The first member of the returned tuple is the estimated size
+    /// cost and the second is the estimated runtime cost.
+    std::tuple<int, int> computeOutliningCost();
+
     /// \brief Perform the extraction, returning the new function.
     ///
     /// Returns zero when called on a CodeExtractor instance where isEligible
Index: lib/Transforms/IPO/PartialInlining.cpp
===================================================================
--- lib/Transforms/IPO/PartialInlining.cpp
+++ lib/Transforms/IPO/PartialInlining.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
@@ -42,6 +43,10 @@
 static cl::opt<bool>
     DisablePartialInlining("disable-partial-inlining", cl::init(false),
                            cl::Hidden, cl::desc("Disable partial ininling"));
+// Testing aid: when set, the partial inliner's cost checks always pass.
+static cl::opt<bool>
+    SkipCostAnalysis("skip-partial-inlining-cost-analysis", cl::init(false),
+                     cl::Hidden, cl::desc("Skip Cost Analysis"));
 
 static cl::opt<unsigned> MaxNumInlineBlocks(
     "max-num-inline-blocks", cl::init(5), cl::Hidden,
@@ -84,8 +89,6 @@
   bool run(Module &M);
   Function *unswitchFunction(Function *F);
 
-  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
-
 private:
   int NumPartialInlining = 0;
   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
@@ -98,6 +101,41 @@
     return (MaxNumPartialInlining != -1 &&
             NumPartialInlining >= MaxNumPartialInlining);
   }
+  // Wraps the user \p U in a CallSite. Every user is expected to be a call
+  // or an invoke instruction; anything else is a programming error.
+  CallSite getCallSite(User *U) {
+    if (auto *Call = dyn_cast<CallInst>(U))
+      return CallSite(Call);
+    if (auto *Invoke = dyn_cast<InvokeInst>(U))
+      return CallSite(Invoke);
+
+    llvm_unreachable("All uses must be calls");
+  }
+  CallSite getOneCallSiteTo(Function *F) {
+    // Takes the first user without checking, so F must have at least one.
+    return getCallSite(*F->user_begin());
+  }
+  // Picks one call site of \p F and returns its debug location together
+  // with the basic block containing that call.
+  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
+    CallSite Site = getOneCallSiteTo(F);
+    return std::make_tuple(Site.getInstruction()->getDebugLoc(),
+                           Site.getParent());
+  }
+
+  // Returns true if the benefit of eliminating the call outweighs the
+  // additional runtime cost associated with the new call to the outlined
+  // function (extracted).
+  bool isPartialInliningBeneficial(Function *F, FunctionOutliningInfo *OI);
+  // Returns the cost associated with function outlining:
+  // - The first value is the code size estimate of the new call sequence
+  //   to the outlined function;
+  // - The second value is the non-weighted runtime cost of that sequence;
+  // - The third value is the size estimate of the original code that is
+  //   extracted into the outlined function.
+  std::tuple<int, int, int>
+  computeOutliningCost(Function *F, const FunctionOutliningInfo *OutliningInfo);
+  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
 };
 
 struct PartialInlinerLegacyPass : public ModulePass {
@@ -293,6 +331,9 @@
                                              OptimizationRemarkEmitter &ORE) {
   // TODO : more sharing with shouldInline in Inliner.cpp
   using namespace ore;
+  if (SkipCostAnalysis)
+    return true;
+
   Instruction *Call = CS.getInstruction();
   Function *Callee = CS.getCalledFunction();
   Function *Caller = CS.getCaller();
@@ -316,7 +357,7 @@
   }
 
   if (!IC) {
-    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
+    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
              << NV("Callee", Callee) << " not partially inlined into "
              << NV("Caller", Caller) << " because too costly to inline (cost="
              << NV("Cost", IC.getCost()) << ", threshold="
@@ -332,6 +373,102 @@
   return true;
 }
 
+// Estimates the cost of outlining every block of \p F other than the
+// entry blocks and the return block recorded in \p OI. Yields the size and
+// runtime cost of the new call sequence plus the size of the moved code.
+std::tuple<int, int, int>
+PartialInlinerImpl::computeOutliningCost(Function *F,
+                                         const FunctionOutliningInfo *OI) {
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(F, &(*GetAssumptionCache)(*F), EphValues);
+
+  std::vector<BasicBlock *> Region;
+  int RegionSize = 0;
+  for (BasicBlock &BB : *F) {
+    // The entry set is assumed to be small, so search it linearly.
+    if (&BB == OI->ReturnBlock ||
+        std::find(OI->Entries.begin(), OI->Entries.end(), &BB) !=
+            OI->Entries.end())
+      continue;
+    Region.push_back(&BB);
+    // Debug intrinsics and ephemeral values are not charged.
+    for (Instruction &Inst : BB)
+      if (!isa<DbgInfoIntrinsic>(Inst) && !EphValues.count(&Inst))
+        RegionSize += InlineConstants::InstrCost;
+  }
+
+  // Query the extractor for the cost of calling the outlined function.
+  int CallSizeCost;
+  int CallRuntimeCost;
+  std::tie(CallSizeCost, CallRuntimeCost) =
+      CodeExtractor(Region, nullptr, false, nullptr, nullptr)
+          .computeOutliningCost();
+  return std::make_tuple(CallSizeCost, CallRuntimeCost, RegionSize);
+}
+
+// Returns true when outlining the region of \p F described by \p OI is
+// expected to pay off: the new call sequence must not be larger than the
+// code it replaces, and its frequency-weighted runtime cost must not exceed
+// the weighted savings of eliminating a call to \p F.
+bool PartialInlinerImpl::isPartialInliningBeneficial(
+    Function *F, FunctionOutliningInfo *OI) {
+  if (SkipCostAnalysis)
+    return true;
+
+  int SizeCost, NonWeightedRcost, OutlinedRegionSize;
+  std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSize) =
+      computeOutliningCost(F, OI);
+
+  OptimizationRemarkEmitter ORE(F);
+  using namespace ore;
+  DebugLoc DLoc;
+  BasicBlock *Block;
+  std::tie(DLoc, Block) = getOneDebugLoc(F);
+
+  // If the call sequence to the outlined function is larger than the
+  // original outlined region, outlining does not increase the chances of
+  // inlining 'F' (the inliner uses the size increase to model the cost of
+  // inlining a callee).
+  if (OutlinedRegionSize < SizeCost) {
+    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+                                        DLoc, Block)
+             << NV("Function", F)
+             << " not partially inlined into callers (Original Size = "
+             << NV("OutlinedRegionOriginalSize", OutlinedRegionSize)
+             << ", Size of call sequence to outlined function = "
+             << NV("NewSize", SizeCost) << ")");
+    return false;
+  }
+
+  // Now compare with weighted runtime cost:
+  LoopInfo LI{DominatorTree(*F)};
+  BranchProbabilityInfo BPI(*F, LI);
+  BlockFrequencyInfo BFI(*F, BPI, LI);
+
+  BlockFrequency EntryFreq = BFI.getBlockFreq(&F->getEntryBlock());
+  BlockFrequency OutlinedCallFreq = BFI.getBlockFreq(OI->NonReturnBlock);
+  CallSite CS = getOneCallSiteTo(F);
+  const DataLayout &DL = F->getParent()->getDataLayout();
+  // The savings of eliminating the call:
+  int NonWeightedSavings = getCallsiteCost(CS, DL);
+  // Normalize both costs by the larger of the two before weighting them.
+  int Denom = std::max(NonWeightedSavings, NonWeightedRcost);
+  BranchProbability NormWeightedSavings(NonWeightedSavings, Denom);
+  BranchProbability NormWeightedRcost(NonWeightedRcost, Denom);
+  // Weighted saving is smaller than weighted cost, return false
+  if (EntryFreq * NormWeightedSavings < OutlinedCallFreq * NormWeightedRcost) {
+    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
+                                        DLoc, Block)
+             << NV("Function", F)
+             << " not partially inlined into callers, runtime overhead"
+             << " of making outlined call is too high");
+    return false;
+  }
+
+  // Ok, we can partial inline this function.
+  return true;
+}
+
 Function *PartialInlinerImpl::unswitchFunction(Function *F) {
 
   if (F->hasAddressTaken())
@@ -347,12 +484,18 @@
   if (PSI->isFunctionEntryCold(F))
     return nullptr;
 
+  if (F->user_begin() == F->user_end())
+    return nullptr;
+
   std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
       computeOutliningInfo(F);
 
   if (!OutliningInfo)
     return nullptr;
 
+  if (!isPartialInliningBeneficial(F, OutliningInfo.get()))
+    return nullptr;
+
   // Clone the function, so that we can hack away on it.
   ValueToValueMapTy VMap;
   Function *DuplicateFunction = CloneFunction(F, VMap);
@@ -452,13 +595,7 @@
                             DuplicateFunction->user_end());
 
   for (User *User : Users) {
-    CallSite CS;
-    if (CallInst *CI = dyn_cast<CallInst>(User))
-      CS = CallSite(CI);
-    else if (InvokeInst *II = dyn_cast<InvokeInst>(User))
-      CS = CallSite(II);
-    else
-      llvm_unreachable("All uses must be calls");
+    CallSite CS = getCallSite(User);
 
     if (IsLimitReached())
       continue;
Index: lib/Transforms/Utils/CodeExtractor.cpp
===================================================================
--- lib/Transforms/Utils/CodeExtractor.cpp
+++ lib/Transforms/Utils/CodeExtractor.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
@@ -655,6 +656,57 @@
   }
 }
 
+// The cost of outlining the region consists of the following parts:
+//   0. The new call/return instruction
+//   1. Input parameter passing
+//   2. Output parameter stack allocation in caller
+//   3. Output parameter pointer passing
+//   4. Output parameter stores (at the exit of the outlined function)
+//   5. Output parameter loads (right after the call to the outlined function)
+//   6. For multiple exits region, the branch cost (conditional branch) incurred
+//      after the call to outlined function
+// The first returned value is the size cost and the second is estimated
+// runtime cost. The runtime cost models the increased runtime overhead
+// due to the newly created instructions, while the size cost models the
+// size impact of the call sequence to the outlined function.
+std::tuple<int, int> CodeExtractor::computeOutliningCost() {
+  // An ineligible region reports zero cost for both components.
+  if (!isEligible())
+    return std::make_tuple(0, 0);
+
+  // Count exits in a local so the result does not depend on previously
+  // accumulated state and repeated queries return the same answer.
+  unsigned ExitCount = 0;
+  for (BasicBlock *Block : Blocks) {
+    if (isa<ReturnInst>(Block->getTerminator())) {
+      ExitCount++;
+      continue;
+    }
+    for (succ_iterator SI = succ_begin(Block), SE = succ_end(Block); SI != SE;
+         ++SI) {
+      if (!Blocks.count(*SI))
+        ExitCount++;
+    }
+  }
+
+  // Find inputs to, outputs from the code region.
+  ValueSet inputs, outputs;
+  findInputsOutputs(inputs, outputs);
+  // TODO: check parameter attribute, callingConv etc.
+  int RCost = (inputs.size() + outputs.size()) * InlineConstants::InstrCost;
+  // For each output parameter, there is cost associated with stack
+  // usage, before return stores, and after call re-loads:
+  RCost += 3 * outputs.size() * InlineConstants::InstrCost;
+  // Branch cost:
+  RCost += ExitCount;
+
+  int SCost = RCost;
+  // Now the cost of calling the outlined function itself:
+  RCost += InlineConstants::CallPenalty;
+  SCost += InlineConstants::InstrCost;
+  return std::make_tuple(SCost, RCost);
+}
+
 void CodeExtractor::moveCodeToFunction(Function *newFunction) {
   Function *oldFunc = (*Blocks.begin())->getParent();
   Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList();
Index: test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
===================================================================
--- test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
+++ test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
 
 ; This test checks to make sure that the CodeExtractor
 ;  properly sets the entry count for the function that is
Index: test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
===================================================================
--- test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
+++ test/Transforms/CodeExtractor/MultipleExitBranchProb.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -max-num-inline-blocks=2 -skip-partial-inlining-cost-analysis -S | FileCheck %s
 
 ; This test checks to make sure that CodeExtractor updates
 ;  the exit branch probabilities for multiple exit blocks.
Index: test/Transforms/CodeExtractor/PartialInlineHighCost.ll
===================================================================
--- test/Transforms/CodeExtractor/PartialInlineHighCost.ll
+++ test/Transforms/CodeExtractor/PartialInlineHighCost.ll
@@ -0,0 +1,63 @@
+; The outlined region has a high execution frequency and the outlining
+; call sequence is expensive (inputs, outputs, multiple exits, etc.).
+; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -passes=partial-inliner -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck --check-prefix=NOCOST %s
+; RUN: opt < %s -passes=partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck  --check-prefix=NOCOST %s
+
+
+; Callee under test (nounwind); !prof on the entry branch favors the bb1 path.
+define i32 @bar(i32 %arg) local_unnamed_addr #0 {
+bb:
+  %tmp = icmp slt i32 %arg, 0
+  br i1 %tmp, label %bb1, label %bb16, !prof !1
+
+bb1:                                              ; preds = %bb
+  %tmp2 = tail call i32 (...) @foo() #0
+  %tmp3 = tail call i32 (...) @foo() #0
+  %tmp4 = tail call i32 (...) @foo() #0
+  %tmp5 = tail call i32 (...) @foo() #0
+  %tmp6 = tail call i32 (...) @foo() #0
+  %tmp7 = tail call i32 (...) @foo() #0
+  %tmp8 = add nsw i32 %arg, 1
+  %tmp9 = tail call i32 @goo(i32 %tmp8) #0
+  %tmp10 = tail call i32 (...) @foo() #0
+  %tmp11 = icmp eq i32 %tmp10, 0
+  br i1 %tmp11, label %bb12, label %bb16
+
+bb12:                                             ; preds = %bb1
+  %tmp13 = tail call i32 (...) @foo() #0
+  %tmp14 = icmp eq i32 %tmp13, 0
+  %tmp15 = select i1 %tmp14, i32 0, i32 3
+  br label %bb16
+
+bb16:                                             ; preds = %bb12, %bb1, %bb
+  %tmp17 = phi i32 [ 2, %bb1 ], [ %tmp15, %bb12 ], [ 0, %bb ]
+  ret i32 %tmp17
+}
+
+; Function Attrs: nounwind
+declare i32 @foo(...) local_unnamed_addr #0
+
+; Function Attrs: nounwind
+declare i32 @goo(i32) local_unnamed_addr #0
+
+; CHECK (cost analysis on): no branch or 'bar.' call here; NOCOST: both.
+define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
+bb:
+; CHECK-LABEL: @dummy_caller
+; CHECK-NOT: br i1
+; CHECK-NOT: call{{.*}}bar.
+; NOCOST-LABEL: @dummy_caller
+; NOCOST: br i1
+; NOCOST: call{{.*}}bar.
+  %tmp = tail call i32 @bar(i32 %arg)
+  ret i32 %tmp
+}
+
+attributes #0 = { nounwind }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 5.0.0 (trunk 301898)"}
+!1 = !{!"branch_weights", i32 2000, i32 1}
Index: test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
===================================================================
--- test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
+++ test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -partial-inliner | llc -filetype=null
-; RUN: opt < %s -partial-inliner -S | FileCheck %s
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis | llc -filetype=null
+; RUN: opt < %s -partial-inliner -skip-partial-inlining-cost-analysis -S | FileCheck %s
 ; This testcase checks to see if CodeExtractor properly inherits
 ;   target specific attributes for the extracted function. This can
 ;   cause certain instructions that depend on the attributes to not