Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1852,7 +1852,7 @@
 
   /// \return An upper bound for the vectorization factor, or None if
   /// vectorization should be avoided up front.
-  Optional<unsigned> computeMaxVF(bool OptForSize);
+  Optional<unsigned> computeMaxVF(bool OptForSize, bool OptForDivergent);
 
   /// Information about vectorization costs
   struct VectorizationFactor {
@@ -2218,6 +2218,7 @@
 
   /// Plan how to best vectorize, return the best VF and its cost.
   LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
+                                                       bool OptForDivergent,
                                                        unsigned UserVF);
 
   /// Generate the IR code for the vectorized loop.
@@ -6270,7 +6271,8 @@
   }
 }
 
-Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize,
+                                                            bool OptForDivergent) {
   if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
     ORE->emit(createMissedAnalysis("ConditionalStore")
               << "store that is conditionally executed prevents vectorization");
@@ -6278,29 +6280,29 @@
     return None;
   }
 
-  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
-    // TODO: It may by useful to do since it's still likely to be dynamically
-    // uniform if the target can skip.
-    DEBUG(dbgs() << "LV: Not inserting runtime ptr check for divergent target");
+  if (Legal->getRuntimePointerChecking()->Need) {
+    if (OptForSize) {
+      ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
+                << "runtime pointer checks needed. Enable vectorization of this "
+                   "loop with '#pragma clang loop vectorize(enable)' when "
+                   "compiling with -Os/-Oz");
+      DEBUG(dbgs()
+            << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
+
+      return None;
+    }
 
-    ORE->emit(
-        createMissedAnalysis("CantVersionLoopWithDivergentTarget")
-        << "runtime pointer checks needed. Not enabled for divergent target");
+    if (OptForDivergent) {
+      // TODO: It may be useful to do this since the check is still likely to
+      // be dynamically uniform if the target can skip it.
+      DEBUG(dbgs()
+            << "LV: Not inserting runtime ptr check for divergent target\n");
 
-    return None;
-  }
-
-  if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
-    return computeFeasibleMaxVF(OptForSize);
+      ORE->emit(
+          createMissedAnalysis("CantVersionLoopWithDivergentTarget")
+          << "runtime pointer checks needed. Not enabled for divergent target");
 
-  if (Legal->getRuntimePointerChecking()->Need) {
-    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
-              << "runtime pointer checks needed. Enable vectorization of this "
-              "loop with '#pragma clang loop vectorize(enable)' when "
-              "compiling with -Os/-Oz");
-    DEBUG(dbgs()
-          << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
-    return None;
+      return None;
+    }
   }
 
   // If we optimize the program for size, avoid creating the tail loop.
@@ -6308,17 +6310,17 @@
   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
   // If we don't know the precise trip count, don't try to vectorize.
-  if (TC < 2) {
+  if (TC < 2 && (OptForSize || OptForDivergent)) {
     ORE->emit(
-        createMissedAnalysis("UnknownLoopCountComplexCFG")
-        << "unable to calculate the loop count due to complex control flow");
-    DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+        createMissedAnalysis("UnknownLoopCountComplexCFG")
+        << "unable to calculate the loop count due to complex control flow");
+    DEBUG(dbgs() << "LV: Aborting. A tail loop is required with "
+                 << (OptForSize ? "-Os/-Oz.\n" : "divergent target.\n"));
     return None;
   }
 
   unsigned MaxVF = computeFeasibleMaxVF(OptForSize);
-
-  if (TC % MaxVF != 0) {
+  if (OptForSize && TC % MaxVF != 0) {
     // If the trip count that we found modulo the vectorization factor is not
     // zero then we require a tail.
     // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
@@ -6327,9 +6329,9 @@
     ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
               << "cannot optimize for size and vectorize at the "
-                 "same time. Enable vectorization of this loop "
-                 "with '#pragma clang loop vectorize(enable)' "
-                 "when compiling with -Os/-Oz");
+              "same time. Enable vectorization of this loop "
+              "with '#pragma clang loop vectorize(enable)' "
+              "when compiling with -Os/-Oz");
     DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
     return None;
   }
 
@@ -7609,12 +7611,13 @@
 }
 
 LoopVectorizationCostModel::VectorizationFactor
-LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
+LoopVectorizationPlanner::plan(bool OptForSize, bool OptForDivergent,
+                               unsigned UserVF) {
   // Width 1 means no vectorize, cost 0 means uncomputed cost.
   const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
                                                                            0U};
 
-  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
+  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize, OptForDivergent);
   if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
     return NoVectorization;
@@ -7829,6 +7832,10 @@
   bool OptForSize =
       Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
 
+  bool OptForDivergent =
+      Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+      TTI->hasBranchDivergence();
+
   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
   // count by optimizing for size, to minimize overheads.
   unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
@@ -7898,7 +7905,7 @@
 
   // Plan how to best vectorize, return the best VF and its cost.
   LoopVectorizationCostModel::VectorizationFactor VF =
-      LVP.plan(OptForSize, UserVF);
+      LVP.plan(OptForSize, OptForDivergent, UserVF);
 
   // Select the interleave count.
   unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
Index: test/Transforms/LoopVectorize/AMDGPU/divergent-loop-bounds.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/AMDGPU/divergent-loop-bounds.ll
@@ -0,0 +1,183 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -loop-vectorize -simplifycfg < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -loop-vectorize -simplifycfg < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -loop-vectorize -pass-remarks-analysis='loop-vectorize' < %s 2>&1 | FileCheck -check-prefixes=GFX9-REMARK %s
+
+; It may make sense to vectorize this if the condition is uniform, but
+; assume that it isn't for now.
+
+; GCN-LABEL: @small_loop_i16_unknown_uniform_size(
+; GCN: load i16
+; GCN: add nsw i16
+; GCN: store i16
+; GCN: br i1 %cond
+
+; GFX9-REMARK: remark: <unknown>:0:0: loop not vectorized: unable to calculate the loop count due to complex control flow
+define amdgpu_kernel void @small_loop_i16_unknown_uniform_size(i16 addrspace(1)* nocapture %inArray, i16 %size) #0 {
+entry:
+  %cmp = icmp sgt i16 %size, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:                                             ; preds = %entry, %loop
+  %iv = phi i16 [ %iv1, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %inArray, i16 %iv
+  %load = load i16, i16 addrspace(1)* %gep, align 2
+  %add = add nsw i16 %load, 6
+  store i16 %add, i16 addrspace(1)* %gep, align 2
+  %iv1 = add i16 %iv, 1
+  %cond = icmp eq i16 %iv1, %size
+  br i1 %cond, label %exit, label %loop
+
+exit:                                             ; preds = %loop, %entry
+  ret void
+}
+
+; GCN-LABEL: @small_loop_i16_unknown_divergent_size(
+; GCN: load i16
+; GCN: add nsw i16
+; GCN: store i16
+; GCN: br i1 %cond
+
+; GFX9-REMARK: remark: <unknown>:0:0: loop not vectorized: unable to calculate the loop count due to complex control flow
+define amdgpu_kernel void @small_loop_i16_unknown_divergent_size(i16 addrspace(1)* nocapture %inArray, i16 addrspace(1)* %size.ptr) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %size.gep = getelementptr inbounds i16, i16 addrspace(1)* %size.ptr, i32 %tid
+  %size = load i16, i16 addrspace(1)* %size.gep
+  %cmp = icmp sgt i16 %size, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:                                             ; preds = %entry, %loop
+  %iv = phi i16 [ %iv1, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %inArray, i16 %iv
+  %load = load i16, i16 addrspace(1)* %gep, align 2
+  %add = add nsw i16 %load, 6
+  store i16 %add, i16 addrspace(1)* %gep, align 2
+  %iv1 = add i16 %iv, 1
+  %cond = icmp eq i16 %iv1, %size
+  br i1 %cond, label %exit, label %loop
+
+exit:                                             ; preds = %loop, %entry
+  ret void
+}
+
+; This loop is vectorized: the trip count is known and divisible by the
+; vectorization factor, so no scalar tail iterations are needed.
+
+; GCN-LABEL: @small_loop_i16_256(
+; GFX9: load <2 x i16>
+; GFX9: add nsw <2 x i16>
+; GFX9: store <2 x i16>
+; GFX9: add i32 %index, 2
+; GFX9: br i1
+
+; VI-NOT: <2 x i16>
+define amdgpu_kernel void @small_loop_i16_256(i16 addrspace(1)* nocapture %inArray) #0 {
+entry:
+  br label %loop
+
+loop:                                             ; preds = %entry, %loop
+  %iv = phi i16 [ %iv1, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %inArray, i16 %iv
+  %load = load i16, i16 addrspace(1)* %gep, align 2
+  %add = add nsw i16 %load, 6
+  store i16 %add, i16 addrspace(1)* %gep, align 2
+  %iv1 = add i16 %iv, 1
+  %cond = icmp eq i16 %iv1, 256
+  br i1 %cond, label %exit, label %loop
+
+exit:                                             ; preds = %loop, %entry
+  ret void
+}
+
+; Trip count not divisible by the vectorization factor of 2, so a scalar
+; tail loop is required.
+; GCN-LABEL: @small_loop_i16_255(
+; GFX9: load <2 x i16>
+; GFX9: add nsw <2 x i16>
+; GFX9: store <2 x i16>
+; GFX9: add i32 %index, 2
+; GFX9: br i1
+
+; VI-NOT: <2 x i16>
+define amdgpu_kernel void @small_loop_i16_255(i16 addrspace(1)* nocapture %inArray) #0 {
+entry:
+  br label %loop
+
+loop:                                             ; preds = %entry, %loop
+  %iv = phi i16 [ %iv1, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %inArray, i16 %iv
+  %load = load i16, i16 addrspace(1)* %gep, align 2
+  %add = add nsw i16 %load, 6
+  store i16 %add, i16 addrspace(1)* %gep, align 2
+  %iv1 = add i16 %iv, 1
+  %cond = icmp eq i16 %iv1, 255
+  br i1 %cond, label %exit, label %loop
+
+exit:                                             ; preds = %loop, %entry
+  ret void
+}
+
+; Metadata indicates it should be vectorized even though it may be
+; divergent.
+; GCN-LABEL: @small_loop_i16_unknown_uniform_size_forced(
+; GCN: load <2 x i16>
+; GCN: add nsw <2 x i16>
+; GCN: store <2 x i16>
+; GCN: add i32 %index, 2
+; GCN: br i1
+define amdgpu_kernel void @small_loop_i16_unknown_uniform_size_forced(i16 addrspace(1)* nocapture %inArray, i32 %size) #0 {
+entry:
+  %cmp = icmp sgt i32 %size, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:                                             ; preds = %loop, %entry
+  %iv = phi i32 [ %iv1, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %inArray, i32 %iv
+  %load = load i16, i16 addrspace(1)* %gep, align 2, !llvm.mem.parallel_loop_access !2
+  %add = add nsw i16 %load, 6
+  store i16 %add, i16 addrspace(1)* %gep, align 2, !llvm.mem.parallel_loop_access !2
+  %iv1 = add i32 %iv, 1
+  %cond = icmp eq i32 %iv1, %size
+  br i1 %cond, label %exit, label %loop, !llvm.loop !2
+
+exit:                                             ; preds = %loop, %entry
+  ret void
+}
+
+; GCN-LABEL: @small_loop_i16_unknown_divergent_size_forced(
+; GCN: load <2 x i16>
+; GCN: add nsw <2 x i16>
+; GCN: store <2 x i16>
+; GCN: add i32 %index, 2
+; GCN: br i1
+define amdgpu_kernel void @small_loop_i16_unknown_divergent_size_forced(i16 addrspace(1)* nocapture %inArray, i16 addrspace(1)* %size.ptr) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %size.gep = getelementptr inbounds i16, i16 addrspace(1)* %size.ptr, i32 %tid
+  %size = load i16, i16 addrspace(1)* %size.gep
+  %cmp = icmp sgt i16 %size, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:                                             ; preds = %loop, %entry
+  %iv = phi i16 [ %iv1, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i16, i16 addrspace(1)* %inArray, i16 %iv
+  %load = load i16, i16 addrspace(1)* %gep, align 2, !llvm.mem.parallel_loop_access !2
+  %add = add nsw i16 %load, 6
+  store i16 %add, i16 addrspace(1)* %gep, align 2, !llvm.mem.parallel_loop_access !2
+  %iv1 = add i16 %iv, 1
+  %cond = icmp eq i16 %iv1, %size
+  br i1 %cond, label %exit, label %loop, !llvm.loop !2
+
+exit:                                             ; preds = %loop, %entry
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
+
+!0 = distinct !{!0}
+!1 = distinct !{!1}
+!2 = distinct !{!2, !3, !4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
\ No newline at end of file
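
For reference, the decision flow that computeMaxVF() implements after this
patch can be summarized in a small standalone sketch. Everything below uses
stand-in types (a hypothetical LoopProperties struct, std::optional) rather
than the actual LLVM classes, so it illustrates the logic only; it is not
the implementation:

// Simplified model of the post-patch computeMaxVF() decision flow.
// LoopProperties is a hypothetical stand-in for the legality/cost-model state.
#include <cstdio>
#include <optional>

struct LoopProperties {
  bool NeedsRuntimePtrChecks; // Legal->getRuntimePointerChecking()->Need
  unsigned TripCount;         // 0 when the trip count is unknown
  unsigned FeasibleMaxVF;     // stands in for computeFeasibleMaxVF(); >= 1
};

std::optional<unsigned> computeMaxVF(const LoopProperties &L, bool OptForSize,
                                     bool OptForDivergent) {
  // Runtime pointer checks are refused under -Os/-Oz, and (for now) also on
  // divergent targets, where the compared pointers may differ per lane.
  if (L.NeedsRuntimePtrChecks && (OptForSize || OptForDivergent))
    return std::nullopt;

  // An unknown or tiny trip count implies a scalar tail loop, which both
  // -Os/-Oz and divergent targets want to avoid.
  if (L.TripCount < 2 && (OptForSize || OptForDivergent))
    return std::nullopt;

  // Only -Os/-Oz additionally requires the trip count to divide evenly by
  // the VF; a divergent target tolerates a scalar remainder loop.
  if (OptForSize && L.TripCount % L.FeasibleMaxVF != 0)
    return std::nullopt;

  return L.FeasibleMaxVF;
}

int main() {
  // Mirrors @small_loop_i16_255: divergent target, known trip count 255,
  // VF 2. Vectorization proceeds and leaves a scalar tail iteration.
  LoopProperties L{false, 255, 2};
  if (auto VF = computeMaxVF(L, /*OptForSize=*/false, /*OptForDivergent=*/true))
    std::printf("MaxVF = %u\n", *VF);
  return 0;
}

As in the patch itself, a loop with a known trip count on a divergent target
is still allowed to produce a scalar remainder; only the unknown-trip-count
and runtime pointer-check cases bail out early.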