Index: include/llvm/Analysis/InlineCost.h
===================================================================
--- include/llvm/Analysis/InlineCost.h
+++ include/llvm/Analysis/InlineCost.h
@@ -33,9 +33,9 @@
 // Various thresholds used by inline cost analysis.
 /// Use when optsize (-Os) is specified.
 const int OptSizeThreshold = 50;
-
 /// Use when minsize (-Oz) is specified.
 const int OptMinSizeThreshold = 5;
+const int OptMinSizeGrowthPenalty = 1;
 
 /// Use when -O3 is specified.
 const int OptAggressiveThreshold = 250;
Index: lib/Analysis/InlineCost.cpp
===================================================================
--- lib/Analysis/InlineCost.cpp
+++ lib/Analysis/InlineCost.cpp
@@ -230,6 +230,9 @@
   InlineResult analyzeBlock(BasicBlock *BB,
                             SmallPtrSetImpl<const Value *> &EphValues);
 
+  /// Estimate the code-size growth of inlining every call to this callee.
+  int getGrowthCost();
+
   // Disable several entry points to the visitor so we don't accidentally use
   // them by declaring but not defining them here.
   void visit(Module *);
@@ -279,7 +282,8 @@
         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
         CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
         Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
-                                       Params.ComputeFullInlineCost || ORE),
+                                       Params.ComputeFullInlineCost || ORE ||
+                                       F.optForMinSize()),
         IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
@@ -1879,6 +1883,8 @@
 
   bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
                                     &F == CS.getCalledFunction();
+
+  Cost += getGrowthCost();
   // If this is a noduplicate call, we can still inline as long as
   // inlining this would cause the removal of the caller (so the instruction
   // is not actually duplicated, just moved).
@@ -1896,6 +1902,25 @@
   return Cost < std::max(1, Threshold);
 }
 
+int CallAnalyzer::getGrowthCost() {
+  // If a function of size S is called N times and each inlining causes a
+  // code growth of X, the net code-size growth from inlining every call
+  // site is (N * X) - S, assuming the per-site growth is constant and the
+  // original function body can be removed afterwards.
+  if (!F.optForMinSize())
+    return 0;
+
+  int Growth = (NumInstructions - NumInstructionsSimplified) * F.getNumUses();
+
+  // If the whole function body can be eliminated after inlining, give a
+  // bonus for the instructions that disappear.
+  // TODO: Perhaps add a smaller bonus if it is only isDiscardableIfUnused().
+  if (F.hasLocalLinkage())
+    Growth -= NumInstructions;
+
+  return Growth * InlineConstants::OptMinSizeGrowthPenalty;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 /// Dump stats about this call's analysis.
 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
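To make the arithmetic in getGrowthCost() easy to check, the sketch below models it as a standalone C++ program. It is only an illustration under assumed values: the CalleeModel struct, its field names, and the numbers in main() are invented for the example, and the optForMinSize() early return is left out so that the formula stands alone.

// Minimal standalone model of the growth heuristic; the struct, its fields,
// and the example numbers are illustrative assumptions, not patch code.
#include <iostream>

namespace {

constexpr int OptMinSizeGrowthPenalty = 1; // mirrors the new constant

struct CalleeModel {
  int NumInstructions;           // S: size of the callee body
  int NumInstructionsSimplified; // instructions expected to fold away per site
  int NumUses;                   // N: number of call sites
  bool HasLocalLinkage;          // body can be dropped once all calls are gone
};

// Net growth = N * X - S, where X is the per-call-site growth that survives
// simplification and S is subtracted only when the body can be removed.
int getGrowthCost(const CalleeModel &F) {
  int PerSiteGrowth = F.NumInstructions - F.NumInstructionsSimplified; // X
  int Growth = PerSiteGrowth * F.NumUses;                              // N * X
  if (F.HasLocalLinkage)
    Growth -= F.NumInstructions;                                       // - S
  return Growth * OptMinSizeGrowthPenalty;
}

} // namespace

int main() {
  // A 20-instruction internal callee where 5 instructions simplify per site.
  CalleeModel ManyUses{20, 5, 7, true}; // 15 * 7 - 20 = 85 -> large penalty
  CalleeModel OneUse{20, 5, 1, true};   // 15 * 1 - 20 = -5 -> small bonus
  std::cout << getGrowthCost(ManyUses) << '\n'; // prints 85
  std::cout << getGrowthCost(OneUse) << '\n';   // prints -5
}

Because analyzeCall adds the same estimate at every call site, a callee that keeps many uses is penalized repeatedly, while a single-use local callee can even receive a small bonus.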
Index: test/Transforms/Inline/InlineGrowthCost.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/InlineGrowthCost.ll
@@ -0,0 +1,85 @@
+; RUN: opt -S -inline %s -o - | grep "tail call fastcc void @DoNotInline"
+; RUN: opt -S -inline %s -o - | not grep "tail call fastcc void @DoInline"
+define dso_local i8* @ShouldNotInlined(i8* returned %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  br label %while.cond
+while.cond:
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+while.body:
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %1 = trunc i32 %num.addr.0 to i8
+  %conv1 = sub i8 %1, %0
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %conv1, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+while.end:
+  ret i8* %dest
+}
+define dso_local void @Caller(i8* %p1, i8* nocapture %p2, i32 %n) #0 {
+entry:
+  %div = lshr i32 %n, 1
+  tail call fastcc void @DoNotInline(i8* %p1, i8* %p2, i32 %div) #0
+  %div1 = lshr i32 %n, 2
+  tail call fastcc void @DoNotInline(i8* %p2, i8* %p1, i32 %div1) #0
+  %div3 = lshr i32 %n, 3
+  tail call fastcc void @DoNotInline(i8* %p1, i8* %p2, i32 %div3) #0
+  %div5 = lshr i32 %n, 4
+  tail call fastcc void @DoNotInline(i8* %p2, i8* %p1, i32 %div5) #0
+  %div7 = lshr i32 %n, 5
+  tail call fastcc void @DoNotInline(i8* %p1, i8* %p2, i32 %div7) #0
+  %div9 = lshr i32 %n, 6
+  tail call fastcc void @DoInline(i8* %p2, i8* %p1, i32 %div9) #0
+  %div11 = lshr i32 %n, 7
+  tail call fastcc void @DoInline(i8* %p2, i8* %p1, i32 %div11) #0
+  %call13 = tail call i8* @ShouldNotInlined(i8* %p1, i8* %p2, i32 %n) #0
+  ret void
+}
+define internal fastcc void @DoNotInline(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  br label %while.cond
+while.cond:
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+while.body:
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %1 = trunc i32 %num.addr.0 to i8
+  %conv1 = add i8 %0, %1
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %conv1, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+while.end:
+  ret void
+}
+define internal fastcc void @DoInline(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  br label %while.cond
+while.cond:
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+while.body:
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %1 = trunc i32 %num.addr.0 to i8
+  %conv1 = sub i8 %1, %0
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %conv1, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+while.end:
+  ret void
+}
+attributes #0 = { minsize optsize }
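Roughly, the test is built around the growth estimate itself: @DoNotInline and @DoInline share essentially the same loop body, so S and the per-site growth X are about equal for both, and the only relevant difference is the number of call sites in @Caller, five versus two. The five-use callee is charged about 5X - S at every call site while the two-use callee is charged only about 2X - S, which is what should keep @DoNotInline out of line at minsize while @DoInline still gets inlined; the grep and not grep RUN lines check exactly that split.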