Index: llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
===================================================================
--- llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
+++ llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
@@ -35,6 +35,11 @@
     return true;
   }
 
+  // If we have a loop invariant computation, we only need to compute it once.
+  // Given that, all but the first occurrence are free.
+  if (!IterationNumber->isZero() && SE.isLoopInvariant(S, L))
+    return true;
+
   auto *AR = dyn_cast<SCEVAddRecExpr>(S);
   if (!AR || AR->getLoop() != L)
     return false;
Index: llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -725,13 +725,67 @@
   return MaxPercentThresholdBoost;
 }
 
-// Returns loop size estimation for unrolled loop.
-static uint64_t getUnrolledLoopSize(
-    unsigned LoopSize,
-    TargetTransformInfo::UnrollingPreferences &UP) {
-  assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
-  return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
-}
+/// Helper class to estimate the size of a loop with a given unroll factor.
+/// Note that there's a more expensive estimator used for full unrolling if the
+/// result of this one doesn't show profit, but this is the only estimate used
+/// for partial and runtime unrolling.
+///
+/// TODO: There are a number of ways to make this more sophisticated.
+/// 1) Extend the uniform logic to understand expressions which are uniform for
+///    a given number of iterations (but not all iterations). An example:
+///      for (int i = 0; i < N; i++) {
+///        if (a[i/4])
+///          b[i]++;
+///      }
+/// 2) Recognize patterns which repeat predictably with unrolling. An example:
+///      for (int i = 0; i < N; i++) {
+///        if (a[i % 4])
+///          b[i]++;
+///      }
+/// 3) Integrate PeelCount. If we know we're peeling once, the globally
+///    uniform expressions don't contribute to the loop size of the remaining
+///    (potentially unrollable) loop.
+/// 4) Use aliasing information to reason about CSEable loads. Consider the
+///    load from 'a' in example #1 with an unroll factor of 4.
+/// 5) Before this gets too complicated, we should consider merging this with
+///    the static unroll cost model.
+class LoopSizeEstimator {
+  Loop &TheLoop;
+  ScalarEvolution &SE;
+  unsigned LoopSize;
+  unsigned UniformSize;
+
+public:
+  LoopSizeEstimator(Loop &L, ScalarEvolution &SE, unsigned LoopSize)
+      : TheLoop(L), SE(SE), LoopSize(LoopSize) {
+    UniformSize = computeUniformInstCount();
+  }
+
+  uint64_t getEstimatedLoopSize(TargetTransformInfo::UnrollingPreferences &UP) {
+    unsigned LocalUniformSize = UniformSize;
+    // Avoid degenerate case where entire loop body is uniform across all
+    // iterations. We should break the backedge, not unroll that.
+    if (LocalUniformSize + UP.BEInsns + 1 > LoopSize)
+      LocalUniformSize = 0;
+
+    assert(LoopSize >= (UP.BEInsns + LocalUniformSize) &&
+           "LoopSize should not be less than BEInsns!");
+    return (uint64_t)(LoopSize - UP.BEInsns - LocalUniformSize) * UP.Count +
+           UP.BEInsns + LocalUniformSize;
+  }
+private:
+  unsigned computeUniformInstCount() {
+    unsigned Count = 0;
+    for (BasicBlock *BB : TheLoop.blocks())
+      for (Instruction &I : *BB) {
+        if (!SE.isSCEVable(I.getType()))
+          continue;
+        if (SE.isLoopInvariant(SE.getSCEV(&I), &TheLoop))
+          Count++;
+      }
+    return Count;
+  }
+};
 
 // Returns true if unroll count was set explicitly.
 // Calculates unroll count and writes it to UP.Count.
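For reference, here is a minimal standalone sketch (not part of the patch) contrasting the old getUnrolledLoopSize() formula with the new estimator's formula. The concrete numbers are assumptions chosen to mirror the partial-uniform.ll test added below: a 12-instruction loop body, 2 backedge instructions, 2 loop-invariant instructions, and an unroll factor of 4, so the estimate drops from 42 to 36.

  // Standalone illustration only; the names mirror the patch but this is not LLVM code.
  #include <cassert>
  #include <cstdint>

  int main() {
    // Assumed values matching the partial-uniform.ll test below.
    unsigned LoopSize = 12, BEInsns = 2, UniformSize = 2, Count = 4;

    // Old model: every non-backedge instruction is replicated Count times.
    uint64_t Old = (uint64_t)(LoopSize - BEInsns) * Count + BEInsns;

    // New model: loop-invariant ("uniform") instructions are charged once,
    // since every copy after the first is expected to be redundant.
    uint64_t New = (uint64_t)(LoopSize - BEInsns - UniformSize) * Count +
                   BEInsns + UniformSize;

    assert(Old == 42 && New == 36);
    return 0;
  }
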
@@ -749,6 +803,8 @@
     TargetTransformInfo::UnrollingPreferences &UP,
     TargetTransformInfo::PeelingPreferences &PP,
     bool &UseUpperBound) {
+  LoopSizeEstimator LSE(*L, SE, LoopSize);
+
   // Check for explicit Count.
   // 1st priority is unroll count set by "unroll-count" option.
   bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
@@ -756,7 +812,7 @@
     UP.Count = UnrollCount;
     UP.AllowExpensiveTripCount = true;
     UP.Force = true;
-    if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
+    if (UP.AllowRemainder && LSE.getEstimatedLoopSize(UP) < UP.Threshold)
       return true;
   }
 
@@ -768,13 +824,13 @@
     UP.AllowExpensiveTripCount = true;
     UP.Force = true;
     if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
-        getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+        LSE.getEstimatedLoopSize(UP) < PragmaUnrollThreshold)
       return true;
   }
   bool PragmaFullUnroll = hasUnrollFullPragma(L);
   if (PragmaFullUnroll && TripCount != 0) {
     UP.Count = TripCount;
-    if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+    if (LSE.getEstimatedLoopSize(UP) < PragmaUnrollThreshold)
       return false;
   }
 
@@ -823,7 +879,7 @@
   if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
     // When computing the unrolled size, note that BEInsns are not replicated
     // like the rest of the loop body.
-    if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
+    if (LSE.getEstimatedLoopSize(UP) < UP.Threshold) {
       UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
       TripCount = FullUnrollTripCount;
       TripMultiple = UP.UpperBound ? 1 : TripMultiple;
@@ -870,7 +926,7 @@
     UP.Count = TripCount;
     if (UP.PartialThreshold != NoThreshold) {
       // Reduce unroll count to be modulo of TripCount for partial unrolling.
-      if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+      if (LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
         UP.Count =
             (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
             (LoopSize - UP.BEInsns);
@@ -885,7 +941,7 @@
       // remainder loop is allowed.
       UP.Count = UP.DefaultUnrollRuntimeCount;
       while (UP.Count != 0 &&
-             getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+             LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
         UP.Count >>= 1;
     }
     if (UP.Count < 2) {
@@ -969,7 +1025,7 @@
     // Reduce unroll count to be the largest power-of-two factor of
     // the original count which satisfies the threshold limit.
     while (UP.Count != 0 &&
-           getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+           LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
       UP.Count >>= 1;
 
 #ifndef NDEBUG
Index: llvm/test/Transforms/LoopUnroll/X86/partial-uniform.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopUnroll/X86/partial-uniform.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -unroll-partial-threshold=36 -unroll-threshold=0 -loop-unroll -unroll-allow-partial -unroll-runtime -unroll-allow-remainder -unroll-max-percent-threshold-boost=0 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+;; Two instructions are loop invariant and thus don't count against
+;; the unroll cost for iteration > 1. As a result, we can unroll this
+;; at cost 36 not 42.
+define i32 @uniform_contents(i32* %A, i32* %B, i32 %c, i32 %d) {
+; CHECK-LABEL: @uniform_contents(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT_3:%.*]], [[LATCH_3:%.*]] ]
+; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACCUM_NEXT_3:%.*]], [[LATCH_3]] ]
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[V1:%.*]] = load i32, i32* [[GEP1]], align 4
+; CHECK-NEXT:    [[EARLYEXIT:%.*]] = icmp eq i32 [[V1]], 0
+; CHECK-NEXT:    br i1 [[EARLYEXIT]], label [[LOOP_EXIT:%.*]], label [[LATCH:%.*]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[INDEX:%.*]] = udiv i32 [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[A]], i32 [[INDEX]]
+; CHECK-NEXT:    [[V2:%.*]] = load i32, i32* [[GEP2]], align 4
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[ACCUM_NEXT:%.*]] = add i32 [[ACCUM]], [[V2]]
+; CHECK-NEXT:    [[GEP1_1:%.*]] = getelementptr i32, i32* [[A]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[V1_1:%.*]] = load i32, i32* [[GEP1_1]], align 4
+; CHECK-NEXT:    [[EARLYEXIT_1:%.*]] = icmp eq i32 [[V1_1]], 0
+; CHECK-NEXT:    br i1 [[EARLYEXIT_1]], label [[LOOP_EXIT]], label [[LATCH_1:%.*]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[ACCUM_LCSSA:%.*]] = phi i32 [ [[ACCUM]], [[LOOP]] ], [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[ACCUM_NEXT_1:%.*]], [[LATCH_1]] ], [ [[ACCUM_NEXT_1]], [[LATCH_2:%.*]] ], [ [[ACCUM_NEXT_2:%.*]], [[LOOP_3:%.*]] ]
+; CHECK-NEXT:    ret i32 [[ACCUM_LCSSA]]
+; CHECK:       latch.1:
+; CHECK-NEXT:    [[INDEX_1:%.*]] = udiv i32 [[C]], [[D]]
+; CHECK-NEXT:    [[GEP2_1:%.*]] = getelementptr i32, i32* [[A]], i32 [[INDEX_1]]
+; CHECK-NEXT:    [[V2_1:%.*]] = load i32, i32* [[GEP2_1]], align 4
+; CHECK-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_NEXT]], 1
+; CHECK-NEXT:    [[ACCUM_NEXT_1]] = add i32 [[ACCUM_NEXT]], [[V2_1]]
+; CHECK-NEXT:    [[GEP1_2:%.*]] = getelementptr i32, i32* [[A]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT:    [[V1_2:%.*]] = load i32, i32* [[GEP1_2]], align 4
+; CHECK-NEXT:    [[EARLYEXIT_2:%.*]] = icmp eq i32 [[V1_2]], 0
+; CHECK-NEXT:    br i1 [[EARLYEXIT_2]], label [[LOOP_EXIT]], label [[LATCH_2]]
+; CHECK:       latch.2:
+; CHECK-NEXT:    [[INDEX_2:%.*]] = udiv i32 [[C]], [[D]]
+; CHECK-NEXT:    [[GEP2_2:%.*]] = getelementptr i32, i32* [[A]], i32 [[INDEX_2]]
+; CHECK-NEXT:    [[V2_2:%.*]] = load i32, i32* [[GEP2_2]], align 4
+; CHECK-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV_NEXT_1]], 1
+; CHECK-NEXT:    [[ACCUM_NEXT_2]] = add i32 [[ACCUM_NEXT_1]], [[V2_2]]
+; CHECK-NEXT:    [[EXIT_2:%.*]] = icmp eq i64 [[IV_NEXT_1]], 198
+; CHECK-NEXT:    br i1 [[EXIT_2]], label [[LOOP_EXIT]], label [[LOOP_3]]
+; CHECK:       loop.3:
+; CHECK-NEXT:    [[GEP1_3:%.*]] = getelementptr i32, i32* [[A]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT:    [[V1_3:%.*]] = load i32, i32* [[GEP1_3]], align 4
+; CHECK-NEXT:    [[EARLYEXIT_3:%.*]] = icmp eq i32 [[V1_3]], 0
+; CHECK-NEXT:    br i1 [[EARLYEXIT_3]], label [[LOOP_EXIT]], label [[LATCH_3]]
+; CHECK:       latch.3:
+; CHECK-NEXT:    [[INDEX_3:%.*]] = udiv i32 [[C]], [[D]]
+; CHECK-NEXT:    [[GEP2_3:%.*]] = getelementptr i32, i32* [[A]], i32 [[INDEX_3]]
+; CHECK-NEXT:    [[V2_3:%.*]] = load i32, i32* [[GEP2_3]], align 4
+; CHECK-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV_NEXT_2]], 1
+; CHECK-NEXT:    [[ACCUM_NEXT_3]] = add i32 [[ACCUM_NEXT_2]], [[V2_3]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+  %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
+  %gep1 = getelementptr i32, i32* %A, i64 %iv
+  %v1 = load i32, i32* %gep1
+  %earlyexit = icmp eq i32 %v1, 0
+  br i1 %earlyexit, label %loop_exit, label %latch
+latch:
+  %index = udiv i32 %c, %d
+  %gep2 = getelementptr i32, i32* %A, i32 %index
+  %v2 = load i32, i32* %gep2
+  %iv.next = add i64 %iv, 1
+  %accum.next = add i32 %accum, %v2
+  %exit = icmp eq i64 %iv, 198
+  br i1 %exit, label %loop_exit, label %loop
+
+loop_exit:
+  ret i32 %accum
+}
+
Index: llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
===================================================================
--- llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
+++ llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
@@ -116,7 +116,7 @@
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_2]]
 ; CHECK-NEXT:    [[DOTPRE_2:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_2]], align 4
 ; CHECK-NEXT:    call void @bar(i32 [[DOTPRE_2]])
-; CHECK-NEXT:    [[INC_3]] = add nsw i64 [[INC_2]], 1
+; CHECK-NEXT:    [[INC_3]] = add nuw nsw i64 [[INC_2]], 1
 ; CHECK-NEXT:    br i1 true, label [[FOR_BODY_3:%.*]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE_3]]
 ; CHECK:       for.body.3:
 ; CHECK-NEXT:    [[CMP_3:%.*]] = call i1 @foo(i64 [[INC_2]])
@@ -124,7 +124,7 @@
 ; CHECK:       for.body.for.body_crit_edge.3:
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_3]]
 ; CHECK-NEXT:    [[DOTPRE_3]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_3]], align 4
-; CHECK-NEXT:    br label [[FOR_HEADER]], !llvm.loop !0
+; CHECK-NEXT:    br label [[FOR_HEADER]], [[LOOP0:!llvm.loop !.*]]
 ;
 entry:
   br i1 true, label %for.preheader, label %for.end
@@ -202,7 +202,7 @@
 ; CHECK:       for.body.for.body_crit_edge.3:
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_3]]
 ; CHECK-NEXT:    [[DOTPRE_3]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_3]], align 4
-; CHECK-NEXT:    br label [[FOR_HEADER]], !llvm.loop !2
+; CHECK-NEXT:    br label [[FOR_HEADER]], [[LOOP2:!llvm.loop !.*]]
 ;
 entry:
   %0 = load i32, i32* %A, align 4