Index: include/llvm/Analysis/InlineCost.h
===================================================================
--- include/llvm/Analysis/InlineCost.h
+++ include/llvm/Analysis/InlineCost.h
@@ -33,9 +33,9 @@
 // Various thresholds used by inline cost analysis.
 /// Use when optsize (-Os) is specified.
 const int OptSizeThreshold = 50;
-
 /// Use when minsize (-Oz) is specified.
 const int OptMinSizeThreshold = 5;
+const int OptMinSizeGrowthPenalty = 1;
 
 /// Use when -O3 is specified.
 const int OptAggressiveThreshold = 250;
Index: lib/Analysis/InlineCost.cpp
===================================================================
--- lib/Analysis/InlineCost.cpp
+++ lib/Analysis/InlineCost.cpp
@@ -230,6 +230,9 @@
   InlineResult analyzeBlock(BasicBlock *BB,
                             SmallPtrSetImpl<const Value *> &EphValues);
 
+  /// Cost growth analysis
+  int getGrowthCost();
+
   // Disable several entry points to the visitor so we don't accidentally use
   // them by declaring but not defining them here.
   void visit(Module *);
@@ -279,7 +282,8 @@
         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
         CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
         Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
-                                       Params.ComputeFullInlineCost || ORE),
+                                       Params.ComputeFullInlineCost || ORE ||
+                                       F.optForMinSize()),
         IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
@@ -1879,6 +1883,8 @@
 
   bool OnlyOneCallAndLocalLinkage =
       F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
+
+  Cost += getGrowthCost();
   // If this is a noduplicate call, we can still inline as long as
   // inlining this would cause the removal of the caller (so the instruction
   // is not actually duplicated, just moved).
@@ -1896,6 +1902,28 @@
   return Cost < std::max(1, Threshold);
 }
 
+int CallAnalyzer::getGrowthCost() {
+  // If the function of size S is called N times and each inlining cause a
+  // code growth of X. The expected growth for inlining it N times is
+  // N * X.
+  // If the function has local linkage, then the cost would actually be
+  // (N * X) - S, as the function would be eliminated.
+  // The expected Growth lowers each time the function is inlined, as N
+  // reduces as well.
+  if (!F.optForMinSize())
+    return 0;
+
+  int Growth = (NumInstructions - NumInstructionsSimplified) * F.getNumUses();
+
+  // Test that we can eliminate the function, and give a bonus
+  // TODO: Perhaps we could add smaller bonus if the function can
+  // isDiscardableIfUnused()
+  if (F.hasLocalLinkage())
+    Growth -= NumInstructions;
+
+  return Growth * InlineConstants::OptMinSizeGrowthPenalty;
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 /// Dump stats about this call's analysis.
 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
Index: test/Transforms/Inline/InlineGrowthCost.ll
===================================================================
--- /dev/null
+++ test/Transforms/Inline/InlineGrowthCost.ll
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -inline %s -o - | FileCheck %s --implicit-check-not "@DoInline"
+define dso_local i8* @ShouldNotInlined(i8* returned %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  br label %while.cond
+while.cond:
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+while.body:
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %1 = trunc i32 %num.addr.0 to i8
+  %conv1 = sub i8 %1, %0
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %conv1, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+while.end:
+  ret i8* %dest
+}
+define dso_local void @Caller(i8* %p1, i8* nocapture %p2, i32 %n) #0 {
+; CHECK-LABEL: @Caller(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DIV:%.*]] = lshr i32 [[N:%.*]], 1
+; CHECK-NEXT:    tail call fastcc void @DoNotInline(i8* [[P1:%.*]], i8* [[P2:%.*]], i32 [[DIV]]) #0
+; CHECK-NEXT:    [[DIV1:%.*]] = lshr i32 [[N]], 2
+; CHECK-NEXT:    tail call fastcc void @DoNotInline(i8* [[P2]], i8* [[P1]], i32 [[DIV1]]) #0
+; CHECK-NEXT:    [[DIV3:%.*]] = lshr i32 [[N]], 3
+; CHECK-NEXT:    tail call fastcc void @DoNotInline(i8* [[P1]], i8* [[P2]], i32 [[DIV3]]) #0
+; CHECK-NEXT:    [[DIV5:%.*]] = lshr i32 [[N]], 4
+; CHECK-NEXT:    tail call fastcc void @DoNotInline(i8* [[P2]], i8* [[P1]], i32 [[DIV5]]) #0
+; CHECK-NEXT:    [[DIV7:%.*]] = lshr i32 [[N]], 5
+; CHECK-NEXT:    tail call fastcc void @DoNotInline(i8* [[P1]], i8* [[P2]], i32 [[DIV7]]) #0
+; CHECK-NEXT:    [[DIV9:%.*]] = lshr i32 [[N]], 6
+; CHECK-NEXT:    br label [[WHILE_COND_I:%.*]]
+; CHECK:       while.cond.i:
+; CHECK-NEXT:    [[NUM_ADDR_0_I:%.*]] = phi i32 [ [[DIV9]], [[ENTRY:%.*]] ], [ [[DEC_I:%.*]], [[WHILE_BODY_I:%.*]] ]
+; CHECK-NEXT:    [[P_DEST_0_I:%.*]] = phi i8* [ [[P2]], [[ENTRY]] ], [ [[INCDEC_PTR2_I:%.*]], [[WHILE_BODY_I]] ]
+; CHECK-NEXT:    [[P_SOURCE_0_I:%.*]] = phi i8* [ [[P1]], [[ENTRY]] ], [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ]
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[NUM_ADDR_0_I]], 0
+; CHECK-NEXT:    br i1 [[CMP_I]], label [[DOINLINE_EXIT:%.*]], label [[WHILE_BODY_I]]
+; CHECK:       while.body.i:
+; CHECK-NEXT:    [[INCDEC_PTR_I]] = getelementptr inbounds i8, i8* [[P_SOURCE_0_I]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[P_SOURCE_0_I]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[NUM_ADDR_0_I]] to i8
+; CHECK-NEXT:    [[CONV1_I:%.*]] = sub i8 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[INCDEC_PTR2_I]] = getelementptr inbounds i8, i8* [[P_DEST_0_I]], i32 1
+; CHECK-NEXT:    store i8 [[CONV1_I]], i8* [[P_DEST_0_I]], align 1
+; CHECK-NEXT:    [[DEC_I]] = add i32 [[NUM_ADDR_0_I]], -1
+; CHECK-NEXT:    br label [[WHILE_COND_I]]
+; CHECK:       DoInline.exit:
+; CHECK-NEXT:    [[DIV11:%.*]] = lshr i32 [[N]], 7
+; CHECK-NEXT:    br label [[WHILE_COND_I15:%.*]]
+; CHECK:       while.cond.i15:
+; CHECK-NEXT:    [[NUM_ADDR_0_I11:%.*]] = phi i32 [ [[DIV11]], [[DOINLINE_EXIT]] ], [ [[DEC_I19:%.*]], [[WHILE_BODY_I20:%.*]] ]
+; CHECK-NEXT:    [[P_DEST_0_I12:%.*]] = phi i8* [ [[P2]], [[DOINLINE_EXIT]] ], [ [[INCDEC_PTR2_I18:%.*]], [[WHILE_BODY_I20]] ]
+; CHECK-NEXT:    [[P_SOURCE_0_I13:%.*]] = phi i8* [ [[P1]], [[DOINLINE_EXIT]] ], [ [[INCDEC_PTR_I16:%.*]], [[WHILE_BODY_I20]] ]
+; CHECK-NEXT:    [[CMP_I14:%.*]] = icmp eq i32 [[NUM_ADDR_0_I11]], 0
+; CHECK-NEXT:    br i1 [[CMP_I14]], label [[DOINLINE_EXIT21:%.*]], label [[WHILE_BODY_I20]]
+; CHECK:       while.body.i20:
+; CHECK-NEXT:    [[INCDEC_PTR_I16]] = getelementptr inbounds i8, i8* [[P_SOURCE_0_I13]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, i8* [[P_SOURCE_0_I13]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[NUM_ADDR_0_I11]] to i8
+; CHECK-NEXT:    [[CONV1_I17:%.*]] = sub i8 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[INCDEC_PTR2_I18]] = getelementptr inbounds i8, i8* [[P_DEST_0_I12]], i32 1
+; CHECK-NEXT:    store i8 [[CONV1_I17]], i8* [[P_DEST_0_I12]], align 1
+; CHECK-NEXT:    [[DEC_I19]] = add i32 [[NUM_ADDR_0_I11]], -1
+; CHECK-NEXT:    br label [[WHILE_COND_I15]]
+; CHECK:       DoInline.exit21:
+; CHECK-NEXT:    br label [[WHILE_COND_I5:%.*]]
+; CHECK:       while.cond.i5:
+; CHECK-NEXT:    [[NUM_ADDR_0_I1:%.*]] = phi i32 [ [[N]], [[DOINLINE_EXIT21]] ], [ [[DEC_I9:%.*]], [[WHILE_BODY_I10:%.*]] ]
+; CHECK-NEXT:    [[P_DEST_0_I2:%.*]] = phi i8* [ [[P1]], [[DOINLINE_EXIT21]] ], [ [[INCDEC_PTR2_I8:%.*]], [[WHILE_BODY_I10]] ]
+; CHECK-NEXT:    [[P_SOURCE_0_I3:%.*]] = phi i8* [ [[P2]], [[DOINLINE_EXIT21]] ], [ [[INCDEC_PTR_I6:%.*]], [[WHILE_BODY_I10]] ]
+; CHECK-NEXT:    [[CMP_I4:%.*]] = icmp eq i32 [[NUM_ADDR_0_I1]], 0
+; CHECK-NEXT:    br i1 [[CMP_I4]], label [[SHOULDNOTINLINED_EXIT:%.*]], label [[WHILE_BODY_I10]]
+; CHECK:       while.body.i10:
+; CHECK-NEXT:    [[INCDEC_PTR_I6]] = getelementptr inbounds i8, i8* [[P_SOURCE_0_I3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[P_SOURCE_0_I3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[NUM_ADDR_0_I1]] to i8
+; CHECK-NEXT:    [[CONV1_I7:%.*]] = sub i8 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[INCDEC_PTR2_I8]] = getelementptr inbounds i8, i8* [[P_DEST_0_I2]], i32 1
+; CHECK-NEXT:    store i8 [[CONV1_I7]], i8* [[P_DEST_0_I2]], align 1
+; CHECK-NEXT:    [[DEC_I9]] = add i32 [[NUM_ADDR_0_I1]], -1
+; CHECK-NEXT:    br label [[WHILE_COND_I5]]
+; CHECK:       ShouldNotInlined.exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %div = lshr i32 %n, 1
+  tail call fastcc void @DoNotInline(i8* %p1, i8* %p2, i32 %div) #0
+  %div1 = lshr i32 %n, 2
+  tail call fastcc void @DoNotInline(i8* %p2, i8* %p1, i32 %div1) #0
+  %div3 = lshr i32 %n, 3
+  tail call fastcc void @DoNotInline(i8* %p1, i8* %p2, i32 %div3) #0
+  %div5 = lshr i32 %n, 4
+  tail call fastcc void @DoNotInline(i8* %p2, i8* %p1, i32 %div5) #0
+  %div7 = lshr i32 %n, 5
+  tail call fastcc void @DoNotInline(i8* %p1, i8* %p2, i32 %div7) #0
+  %div9 = lshr i32 %n, 6
+  tail call fastcc void @DoInline(i8* %p2, i8* %p1, i32 %div9) #0
+  %div11 = lshr i32 %n, 7
+  tail call fastcc void @DoInline(i8* %p2, i8* %p1, i32 %div11) #0
+  %call13 = tail call i8* @ShouldNotInlined(i8* %p1, i8* %p2, i32 %n) #0
+  ret void
+}
+define internal fastcc void @DoNotInline(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  br label %while.cond
+while.cond:
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+while.body:
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %1 = trunc i32 %num.addr.0 to i8
+  %conv1 = add i8 %0, %1
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %conv1, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+while.end:
+  ret void
+}
+define internal fastcc void @DoInline(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
+; CHECK-NOT-LABEL: @DoInline
+entry:
+  br label %while.cond
+while.cond:
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+while.body:
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %1 = trunc i32 %num.addr.0 to i8
+  %conv1 = sub i8 %1, %0
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %conv1, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+while.end:
+  ret void
+}
+attributes #0 = { minsize optsize }