Index: llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
===================================================================
--- llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
+++ llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
@@ -35,6 +35,11 @@
     return true;
   }
 
+  // If we have a loop invariant computation, we only need to compute it once.
+  // Given that, all but the first occurrence are free.
+  if (!IterationNumber->isZero() && SE.isLoopInvariant(S, L))
+    return true;
+
   auto *AR = dyn_cast<SCEVAddRecExpr>(S);
   if (!AR || AR->getLoop() != L)
     return false;
Index: llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -725,13 +725,73 @@
   return MaxPercentThresholdBoost;
 }
 
-// Returns loop size estimation for unrolled loop.
-static uint64_t getUnrolledLoopSize(
-    unsigned LoopSize,
-    TargetTransformInfo::UnrollingPreferences &UP) {
-  assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
-  return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
-}
+/// Helper class to estimate the size of a loop with a given unroll factor.
+/// Note that there's a more expensive estimator used for full unrolling if the
+/// result of this one doesn't show profit, but this is the only estimate used
+/// for partial and runtime unrolling.
+///
+/// TODO: There are a number of ways to make this more sophisticated.
+/// 1) Extend the uniform logic to understand expressions which are uniform for
+///    a given number of iterations (but not all iterations). An example:
+///      for (int i = 0; i < N; i++) {
+///        if (a[i/4])
+///          b[i];
+///      }
+/// 2) Recognize patterns which repeat predictably with unrolling. An example:
+///      for (int i = 0; i < N; i++) {
+///        if (a[i % 4])
+///          b[i];
+///      }
+/// 3) Integrate PeelCount. If we know we're peeling once, the globally
+///    uniform expressions don't contribute to the loop size of the remaining
+///    (potentially unrollable) loop.
+/// 4) Use aliasing information to reason about CSEable loads. Consider the
+///    load from 'a' in example #1 with an unroll factor of 4.
+/// 5) Before this gets too complicated, we should consider merging this with
+///    the static unroll cost model.
+class LoopSizeEstimator {
+  Loop &TheLoop;
+  const TargetTransformInfo &TTI;
+  ScalarEvolution &SE;
+  // Note: Both 'size' fields here are in units of
+  // TTI.getUserCost(I, TCK_CodeSize), not instruction counts.
+  unsigned LoopSize;
+  unsigned UniformSize; // Computed lazily; -1U means "not yet computed".
+
+public:
+  LoopSizeEstimator(Loop &L, const TargetTransformInfo &TTI,
+                    ScalarEvolution &SE, unsigned LoopSize)
+      : TheLoop(L), TTI(TTI), SE(SE), LoopSize(LoopSize), UniformSize(-1U) {}
+
+  uint64_t getEstimatedLoopSize(TargetTransformInfo::UnrollingPreferences &UP) {
+    if (-1U == UniformSize)
+      UniformSize = computeUniformInstCost();
+    unsigned LocalUniformSize = UniformSize;
+    // Avoid the degenerate case where the entire loop body is uniform across
+    // all iterations. We should break the backedge, not unroll that.
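+    // (Illustrative numbers, not taken from the patch: with LoopSize = 10,
+    // UP.BEInsns = 2, and UniformSize = 9, the guard below resets
+    // LocalUniformSize to 0; otherwise the unsigned subtraction in the
+    // return expression would wrap.)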
+    if (LocalUniformSize + UP.BEInsns + 1 > LoopSize)
+      LocalUniformSize = 0;
+
+    assert(LoopSize >= (UP.BEInsns + LocalUniformSize) &&
+           "LoopSize should not be less than BEInsns plus UniformSize!");
+    return (uint64_t)(LoopSize - UP.BEInsns - LocalUniformSize) * UP.Count +
+           UP.BEInsns + LocalUniformSize;
+  }
+
+private:
+  // Sum the code-size cost of every instruction whose result is provably
+  // loop invariant; such values only need to be materialized once no matter
+  // how many times the body is unrolled.
+  unsigned computeUniformInstCost() {
+    unsigned Count = 0;
+    for (BasicBlock *BB : TheLoop.blocks())
+      for (Instruction &I : *BB) {
+        if (!SE.isSCEVable(I.getType()))
+          continue;
+        if (!SE.isLoopInvariant(SE.getSCEV(&I), &TheLoop))
+          continue;
+        Count += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
+      }
+    return Count;
+  }
+};
 
 // Returns true if unroll count was set explicitly.
 // Calculates unroll count and writes it to UP.Count.
@@ -749,6 +809,8 @@
                                TargetTransformInfo::UnrollingPreferences &UP,
                                TargetTransformInfo::PeelingPreferences &PP,
                                bool &UseUpperBound) {
+  LoopSizeEstimator LSE(*L, TTI, SE, LoopSize);
+
   // Check for explicit Count.
   // 1st priority is unroll count set by "unroll-count" option.
   bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
@@ -756,7 +818,7 @@
     UP.Count = UnrollCount;
     UP.AllowExpensiveTripCount = true;
     UP.Force = true;
-    if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
+    if (UP.AllowRemainder && LSE.getEstimatedLoopSize(UP) < UP.Threshold)
       return true;
   }
@@ -768,13 +830,13 @@
     UP.AllowExpensiveTripCount = true;
     UP.Force = true;
     if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
-        getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+        LSE.getEstimatedLoopSize(UP) < PragmaUnrollThreshold)
       return true;
   }
   bool PragmaFullUnroll = hasUnrollFullPragma(L);
   if (PragmaFullUnroll && TripCount != 0) {
     UP.Count = TripCount;
-    if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+    if (LSE.getEstimatedLoopSize(UP) < PragmaUnrollThreshold)
       return false;
   }
@@ -823,7 +885,7 @@
   if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
     // When computing the unrolled size, note that BEInsns are not replicated
     // like the rest of the loop body.
-    if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
+    if (LSE.getEstimatedLoopSize(UP) < UP.Threshold) {
       UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
       TripCount = FullUnrollTripCount;
       TripMultiple = UP.UpperBound ? 1 : TripMultiple;
@@ -870,7 +932,7 @@
     UP.Count = TripCount;
     if (UP.PartialThreshold != NoThreshold) {
       // Reduce unroll count to be modulo of TripCount for partial unrolling.
-      if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+      if (LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
         UP.Count =
             (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
             (LoopSize - UP.BEInsns);
@@ -885,7 +947,7 @@
       // remainder loop is allowed.
       UP.Count = UP.DefaultUnrollRuntimeCount;
       while (UP.Count != 0 &&
-             getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+             LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
         UP.Count >>= 1;
     }
     if (UP.Count < 2) {
@@ -969,7 +1031,7 @@
     // Reduce unroll count to be the largest power-of-two factor of
     // the original count which satisfies the threshold limit.
     while (UP.Count != 0 &&
-           getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+           LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
       UP.Count >>= 1;
 
 #ifndef NDEBUG
Index: llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
===================================================================
--- llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
+++ llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
@@ -116,7 +116,7 @@
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_2]]
 ; CHECK-NEXT:    [[DOTPRE_2:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_2]], align 4
 ; CHECK-NEXT:    call void @bar(i32 [[DOTPRE_2]])
-; CHECK-NEXT:    [[INC_3]] = add nsw i64 [[INC_2]], 1
+; CHECK-NEXT:    [[INC_3]] = add nuw nsw i64 [[INC_2]], 1
 ; CHECK-NEXT:    br i1 true, label [[FOR_BODY_3:%.*]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE_3]]
 ; CHECK:       for.body.3:
 ; CHECK-NEXT:    [[CMP_3:%.*]] = call i1 @foo(i64 [[INC_2]])
@@ -124,7 +124,7 @@
 ; CHECK:       for.body.for.body_crit_edge.3:
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_3]]
 ; CHECK-NEXT:    [[DOTPRE_3]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_3]], align 4
-; CHECK-NEXT:    br label [[FOR_HEADER]], !llvm.loop !0
+; CHECK-NEXT:    br label [[FOR_HEADER]], [[LOOP0:!llvm.loop !.*]]
 ;
 entry:
   br i1 true, label %for.preheader, label %for.end
@@ -202,7 +202,7 @@
 ; CHECK:       for.body.for.body_crit_edge.3:
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_3]]
 ; CHECK-NEXT:    [[DOTPRE_3]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_3]], align 4
-; CHECK-NEXT:    br label [[FOR_HEADER]], [[LOOP2:!llvm.loop !.*]]
 ;
 entry:
   %0 = load i32, i32* %A, align 4
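For illustration only (hypothetical source, not part of the patch): the
LoopUnrollAnalyzer change treats loop-invariant computations as free on every
simulated iteration after the first, and LoopSizeEstimator applies the same
idea to partial/runtime unrolling by counting uniform instructions once. A
loop of the form

  for (int i = 0; i < N; i++)
    a[i] = b[i] + n * m;  // 'n * m' is loop invariant

is therefore estimated as computing 'n * m' a single time, rather than once
per unrolled copy of the body.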