Index: include/llvm/Analysis/InlineCost.h =================================================================== --- include/llvm/Analysis/InlineCost.h +++ include/llvm/Analysis/InlineCost.h @@ -44,6 +44,7 @@ const int InstrCost = 5; const int IndirectCallThreshold = 100; const int CallPenalty = 25; +const int CallMinSizePenalty = 15; const int LastCallToStaticBonus = 15000; const int ColdccPenalty = 2000; const int NoreturnPenalty = 10000; @@ -201,7 +202,7 @@ /// Return the cost associated with a callsite, including parameter passing /// and the call/return instruction. -int getCallsiteCost(CallSite CS, const DataLayout &DL); +int getCallsiteCost(CallSite CS, const DataLayout &DL, int CallPenalty); /// Get an InlineCost object representing the cost of inlining this /// callsite. Index: lib/Analysis/InlineCost.cpp =================================================================== --- lib/Analysis/InlineCost.cpp +++ lib/Analysis/InlineCost.cpp @@ -129,6 +129,11 @@ int Cost; bool ComputeFullInlineCost; + /// The penalty for making calls. Defaults to InlineConstants::CallPenalty, but + /// lower at minsize (still not zero due to the inaccuracies in cost + /// modelling). 
+ int CallPenalty; + bool IsCallerRecursive; bool IsRecursiveCall; bool ExposesReturnsTwice; @@ -280,16 +285,16 @@ CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold), Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost || Params.ComputeFullInlineCost || ORE), - IsCallerRecursive(false), IsRecursiveCall(false), - ExposesReturnsTwice(false), HasDynamicAlloca(false), - ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false), - HasUninlineableIntrinsic(false), InitsVargArgs(false), AllocatedSize(0), - NumInstructions(0), NumVectorInstructions(0), VectorBonus(0), - SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0), - NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), - NumConstantPtrCmps(0), NumConstantPtrDiffs(0), - NumInstructionsSimplified(0), SROACostSavings(0), - SROACostSavingsLost(0) {} + CallPenalty(InlineConstants::CallPenalty), IsCallerRecursive(false), + IsRecursiveCall(false), ExposesReturnsTwice(false), + HasDynamicAlloca(false), ContainsNoDuplicateCall(false), + HasReturn(false), HasIndirectBr(false), HasUninlineableIntrinsic(false), + InitsVargArgs(false), AllocatedSize(0), NumInstructions(0), + NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0), + EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0), + NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0), + NumConstantPtrDiffs(0), NumInstructionsSimplified(0), + SROACostSavings(0), SROACostSavingsLost(0) {} InlineResult analyzeCall(CallSite CS); @@ -585,6 +590,12 @@ SROAArgValues[&I] = SROAArg; // Constant GEPs are modeled as free. + // Except at minsize, we don't count those used in PHI nodes. 
+ if (I.getParent()->getParent()->optForMinSize()) { + for (auto U : I.users()) + if (isa<PHINode>(U)) + return false; + } return true; } @@ -719,7 +730,7 @@ case Instruction::FPToUI: case Instruction::FPToSI: if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive) - Cost += InlineConstants::CallPenalty; + Cost += CallPenalty; default: break; } @@ -897,6 +908,7 @@ // and reduce the threshold if the caller has the necessary attribute. if (Caller->optForMinSize()) { Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold); + CallPenalty = InlineConstants::CallMinSizePenalty; // For minsize, we want to disable the single BB bonus and the vector // bonuses, but not the last-call-to-static bonus. Inlining the last call to // a static function will, at the minimum, eliminate the parameter setup and @@ -1087,7 +1099,7 @@ // as such. if (I.getType()->isFloatingPointTy() && TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive) - Cost += InlineConstants::CallPenalty; + Cost += CallPenalty; return false; } @@ -1259,7 +1271,7 @@ // Everything other than inline ASM will also have a significant cost // merely from making the call. if (!isa<InlineAsm>(CS.getCalledValue())) - Cost += InlineConstants::CallPenalty; + Cost += CallPenalty; } if (!CS.onlyReadsMemory()) @@ -1735,7 +1747,7 @@ // Give out bonuses for the callsite, as the instructions setting them up // will be gone after inlining. - Cost -= getCallsiteCost(CS, DL); + Cost -= getCallsiteCost(CS, DL, CallPenalty); // If this function uses the coldcc calling convention, prefer not to inline // it. @@ -1825,6 +1837,30 @@ if (BB->hasAddressTaken()) return "blockaddress"; + // Count the number of unconditional branches leading to this block. Add an + // instruction cost for each past the first. This models the cost of + // branches to the block. 
+ if (Caller->optForMinSize()) { + bool FoundUnconditional = false; + for (auto *PredBB : predecessors(BB)) { + // Discount edges we know to not be taken + if (DeadBlocks.count(PredBB)) + continue; + BasicBlock *KnownSuccessor = KnownSuccessors[PredBB]; + if (KnownSuccessor && KnownSuccessor != BB) + continue; + + Instruction *TI = PredBB->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { + if (!BI->isConditional()) { + if (FoundUnconditional) + Cost += InlineConstants::InstrCost; + FoundUnconditional = true; + } + } + } + } + // Analyze the cost of this block. If we blow through the threshold, this // returns false, and we can bail on out. InlineResult IR = analyzeBlock(BB, EphValues); @@ -1925,7 +1961,7 @@ AttributeFuncs::areInlineCompatible(*Caller, *Callee); } -int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) { +int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL, int CallPenalty) { int Cost = 0; for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) { if (CS.isByValArgument(I)) { @@ -1954,7 +1990,7 @@ } } // The call instruction also disappears after inlining. 
- Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty; + Cost += InlineConstants::InstrCost + CallPenalty; return Cost; } Index: lib/Transforms/IPO/PartialInlining.cpp =================================================================== --- lib/Transforms/IPO/PartialInlining.cpp +++ lib/Transforms/IPO/PartialInlining.cpp @@ -798,7 +798,7 @@ const DataLayout &DL = Caller->getParent()->getDataLayout(); // The savings of eliminating the call: - int NonWeightedSavings = getCallsiteCost(CS, DL); + int NonWeightedSavings = getCallsiteCost(CS, DL, InlineConstants::CallPenalty); BlockFrequency NormWeightedSavings(NonWeightedSavings); // Weighted saving is smaller than weighted cost, return false @@ -860,12 +860,12 @@ } if (CallInst *CI = dyn_cast<CallInst>(I)) { - InlineCost += getCallsiteCost(CallSite(CI), DL); + InlineCost += getCallsiteCost(CallSite(CI), DL, InlineConstants::CallPenalty); continue; } if (InvokeInst *II = dyn_cast<InvokeInst>(I)) { - InlineCost += getCallsiteCost(CallSite(II), DL); + InlineCost += getCallsiteCost(CallSite(II), DL, InlineConstants::CallPenalty); continue; } Index: test/Transforms/Inline/AArch64/phi.ll =================================================================== --- test/Transforms/Inline/AArch64/phi.ll +++ test/Transforms/Inline/AArch64/phi.ll @@ -345,7 +345,7 @@ br label %exit exit: - %phi = phi i32* [%gep1, %entry], [%gep3, %if_true] ; Simplifeid to %gep1 + %phi = phi i32* [%gep1, %entry], [%gep3, %if_true] ; Simplified to %gep1 %load = load i32, i32* %phi call void @pad() ret i32 %load Index: test/Transforms/Inline/ARM/loop-add.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/ARM/loop-add.ll @@ -0,0 +1,95 @@ +; RUN: opt -inline %s -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7m-arm-none-eabi" + +; CHECK-LABEL: void @doCalls +define void @doCalls(i8* nocapture %p1, i8* nocapture %p2, i32 %n) #0 { +entry: + 
%div = lshr i32 %n, 1 +; CHECK: call void @LoopCall + tail call void @LoopCall(i8* %p1, i8* %p2, i32 %div) #0 + + %div2 = lshr i32 %n, 2 +; CHECK: call void @LoopCall + tail call void @LoopCall(i8* %p1, i8* %p2, i32 %div2) #0 + +; CHECK-NOT: call void @LoopCall + tail call void @LoopCall(i8* %p2, i8* %p1, i32 0) #0 + +; CHECK-NOT: call void @LoopCall_internal + tail call void @LoopCall_internal(i8* %p1, i8* %p2, i32 %div2) #0 + + %div3 = lshr i32 %n, 4 +; CHECK-NOT: call void @SimpleCall + tail call void @SimpleCall(i8* %p2, i8* %p1, i32 %div3) #0 + ret void +} + +; CHECK-LABEL: define void @LoopCall +define void @LoopCall(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 { +entry: + %c = icmp ne i32 %num, 0 + br i1 %c, label %while.cond, label %while.end + +while.cond: ; preds = %while.body, %entry + %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ] + %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ] + %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ] + %cmp = icmp eq i32 %num.addr.0, 0 + br i1 %cmp, label %while.end, label %while.body + +while.body: ; preds = %while.cond + %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1 + %0 = load i8, i8* %p_source.0, align 1 + %1 = trunc i32 %num.addr.0 to i8 + %conv1 = add i8 %0, %1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1 + store i8 %conv1, i8* %p_dest.0, align 1 + %dec = add i32 %num.addr.0, -1 + br label %while.cond + +while.end: ; preds = %while.cond + ret void +} + +; CHECK-LABEL-NOT: define void @LoopCall_internal +define internal void @LoopCall_internal(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 { +entry: + %c = icmp ne i32 %num, 0 + br i1 %c, label %while.cond, label %while.end + +while.cond: ; preds = %while.body, %entry + %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ] + %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ] + %p_source.0 = phi i8* [ %source, 
%entry ], [ %incdec.ptr, %while.body ] + %cmp = icmp eq i32 %num.addr.0, 0 + br i1 %cmp, label %while.end, label %while.body + +while.body: ; preds = %while.cond + %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1 + %0 = load i8, i8* %p_source.0, align 1 + %1 = trunc i32 %num.addr.0 to i8 + %conv1 = add i8 %0, %1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1 + store i8 %conv1, i8* %p_dest.0, align 1 + %dec = add i32 %num.addr.0, -1 + br label %while.cond + +while.end: ; preds = %while.cond + ret void +} + +; CHECK-LABEL: define void @SimpleCall +define void @SimpleCall(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 { +entry: + %arrayidx = getelementptr inbounds i8, i8* %source, i32 %num + %0 = load i8, i8* %arrayidx, align 1 + %1 = xor i8 %0, 127 + %arrayidx2 = getelementptr inbounds i8, i8* %dest, i32 %num + store i8 %1, i8* %arrayidx2, align 1 + ret void +} + +attributes #0 = { minsize optsize } + Index: test/Transforms/Inline/ARM/loop-memcpy.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/ARM/loop-memcpy.ll @@ -0,0 +1,87 @@ +; RUN: opt -inline %s -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7m-arm-none-eabi" + +; CHECK-LABEL: define void @matcpy +define void @matcpy(i8* %dest, i8* %source, i32 %num) #0 { +entry: + %0 = ptrtoint i8* %dest to i32 + %1 = ptrtoint i8* %source to i32 + %2 = xor i32 %0, %1 + %3 = and i32 %2, 3 + %cmp = icmp eq i32 %3, 0 + br i1 %cmp, label %if.then, label %if.else20 + +if.then: ; preds = %entry + %sub = sub i32 0, %0 + %and2 = and i32 %sub, 3 + %add = or i32 %and2, 4 + %cmp3 = icmp ugt i32 %add, %num + br i1 %cmp3, label %if.else, label %if.then4 + +if.then4: ; preds = %if.then + %sub5 = sub i32 %num, %and2 + %shr = and i32 %sub5, -4 + %sub7 = sub i32 %sub5, %shr + %tobool = icmp eq i32 %and2, 0 + br i1 %tobool, label %if.end, label %if.then8 + 
+if.then8: ; preds = %if.then4 +; CHECK: call fastcc void @memcpy + call fastcc void @memcpy(i8* %dest, i8* %source, i32 %and2) #0 + %add.ptr = getelementptr inbounds i8, i8* %dest, i32 %and2 + %add.ptr9 = getelementptr inbounds i8, i8* %source, i32 %and2 + br label %if.end + +if.end: ; preds = %if.then4, %if.then8 + %p_dest.0 = phi i8* [ %add.ptr, %if.then8 ], [ %dest, %if.then4 ] + %p_source.0 = phi i8* [ %add.ptr9, %if.then8 ], [ %source, %if.then4 ] + %tobool14 = icmp eq i32 %sub7, 0 + br i1 %tobool14, label %if.end22, label %if.then15 + +if.then15: ; preds = %if.end + %add.ptr13 = getelementptr inbounds i8, i8* %p_source.0, i32 %shr + %add.ptr11 = getelementptr inbounds i8, i8* %p_dest.0, i32 %shr +; CHECK: call fastcc void @memcpy + call fastcc void @memcpy(i8* %add.ptr11, i8* %add.ptr13, i32 %sub7) #0 + br label %if.end22 + +if.else: ; preds = %if.then + call fastcc void @memcpy(i8* %dest, i8* %source, i32 %num) #0 + br label %if.end22 + +if.else20: ; preds = %entry + call fastcc void @memcpy(i8* %dest, i8* %source, i32 %num) #0 + br label %if.end22 + +if.end22: ; preds = %if.then15, %if.end, %if.else, %if.else20 + ret void +} + +; CHECK-LABEL: define internal void @memcpy +define internal void @memcpy(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 { +entry: + br label %while.cond + +while.cond: ; preds = %while.body, %entry + %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ] + %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr1, %while.body ] + %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ] + %cmp = icmp eq i32 %num.addr.0, 0 + br i1 %cmp, label %while.end, label %while.body + +while.body: ; preds = %while.cond + %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1 + %0 = load i8, i8* %p_source.0, align 1 + %incdec.ptr1 = getelementptr inbounds i8, i8* %p_dest.0, i32 1 + store i8 %0, i8* %p_dest.0, align 1 + %dec = add i32 %num.addr.0, -1 + br label %while.cond + +while.end: ; preds = 
%while.cond + ret void +} + +attributes #0 = { minsize optsize } + Index: test/Transforms/Inline/ARM/loop-noinline.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/ARM/loop-noinline.ll @@ -0,0 +1,49 @@ +; RUN: opt -inline %s -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7m-arm-none-eabi" + +; Check we don't inline loops at -Oz. They tend to be larger than we +; expect. + +; CHECK: define i8* @H +@digits = constant [16 x i8] c"0123456789ABCDEF", align 1 +define i8* @H(i8* %p, i32 %val, i32 %num) #0 { +entry: + br label %do.body + +do.body: ; preds = %do.body, %entry + %p.addr.0 = phi i8* [ %p, %entry ], [ %incdec.ptr, %do.body ] + %val.addr.0 = phi i32 [ %val, %entry ], [ %shl, %do.body ] + %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %do.body ] + %shr = lshr i32 %val.addr.0, 28 + %arrayidx = getelementptr inbounds [16 x i8], [16 x i8]* @digits, i32 0, i32 %shr + %0 = load i8, i8* %arrayidx, align 1 + %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i32 1 + store i8 %0, i8* %p.addr.0, align 1 + %shl = shl i32 %val.addr.0, 4 + %dec = add i32 %num.addr.0, -1 + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %do.end, label %do.body + +do.end: ; preds = %do.body + %scevgep = getelementptr i8, i8* %p, i32 %num + ret i8* %scevgep +} + +define nonnull i8* @call1(i8* %p, i32 %val, i32 %num) #0 { +entry: +; CHECK: tail call i8* @H + %call = tail call i8* @H(i8* %p, i32 %val, i32 %num) #0 + ret i8* %call +} + +define nonnull i8* @call2(i8* %p, i32 %val) #0 { +entry: +; CHECK: tail call i8* @H + %call = tail call i8* @H(i8* %p, i32 %val, i32 32) #0 + ret i8* %call +} + +attributes #0 = { minsize optsize } +