Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -918,6 +918,10 @@
                                 unsigned ChainSizeInBytes,
                                 VectorType *VecTy) const;
 
+  /// \returns The minimum vector factor for vectors of elements of type
+  /// \p ElementTy.
+  unsigned getMinimumVF(Type *ElementTy) const;
+
   /// Flags describing the kind of vector reduction.
   struct ReductionFlags {
     ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {}
@@ -1124,6 +1128,7 @@
   virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const = 0;
+  virtual unsigned getMinimumVF(Type *ElementTy) const = 0;
   virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                      ReductionFlags) const = 0;
   virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
@@ -1501,6 +1506,9 @@
                                 VectorType *VecTy) const override {
     return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
   }
+  unsigned getMinimumVF(Type *ElementTy) const override {
+    return Impl.getMinimumVF(ElementTy);
+  }
   bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const override {
     return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -541,6 +541,10 @@
     return VF;
   }
 
+  unsigned getMinimumVF(Type *ElementTy) const {
+    return 2;
+  }
+
   bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
                              TTI::ReductionFlags Flags) const {
     return false;
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -593,6 +593,10 @@
   return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
 }
 
+unsigned TargetTransformInfo::getMinimumVF(Type *ElementTy) const {
+  return TTIImpl->getMinimumVF(ElementTy);
+}
+
 bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode,
                                                 Type *Ty, ReductionFlags Flags) const {
   return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1787,15 +1787,15 @@
       : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
         AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
 
-  /// \return An upper bound for the vectorization factor, or None if
-  /// vectorization should be avoided up front.
-  Optional<unsigned> computeMaxVF(bool OptForSize);
+  /// \return A lower and an upper bounds for the vectorization factor,
+  /// or None if vectorization should be avoided up front.
+  Optional<std::pair<unsigned,unsigned>> computeMaxVF(bool OptForSize);
 
   /// \return The most profitable vectorization factor and the cost of that VF.
-  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
-  /// then this vectorization factor will be selected if vectorization is
-  /// possible.
-  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
+  /// This method checks every power of two from MinVF up to MaxVF. If UserVF
+  /// is not ZERO then this vectorization factor will be selected if
+  /// vectorization is possible.
+  VectorizationFactor selectVectorizationFactor(unsigned MinVF, unsigned MaxVF);
 
   /// Setup cost-based decisions for user vectorization factor.
   void selectUserVectorizationFactor(unsigned UserVF) {
@@ -1806,7 +1806,7 @@
   /// \return The size (in bits) of the smallest and widest types in the code
   /// that needs to be vectorized. We ignore values that remain scalar such as
   /// 64 bit loop indices.
-  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
+  SmallVector<Type*,4> getSmallestAndWidestTypes();
 
   /// \return The desired interleave count.
   /// If interleave count has been specified by metadata it will be returned.
@@ -2028,9 +2028,11 @@
 private:
   unsigned NumPredStores = 0;
 
-  /// \return An upper bound for the vectorization factor, larger than zero.
-  /// One is returned if vectorization should best be avoided due to cost.
-  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
+  /// \return A lower and an upper bound for the vectorization factor,
+  /// larger than zero. One is returned if vectorization should best be
+  /// avoided due to cost.
+  std::pair<unsigned,unsigned> computeFeasibleMaxVF(bool OptForSize,
+                                                    unsigned ConstTripCount);
 
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
@@ -6017,7 +6019,8 @@
   }
 }
 
-Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
+Optional<std::pair<unsigned,unsigned>>
+LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
     // TODO: It may by useful to do since it's still likely to be dynamically
     // uniform if the target can skip.
@@ -6056,9 +6059,9 @@
     return None;
   }
 
-  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
+  std::pair<unsigned,unsigned> Range = computeFeasibleMaxVF(OptForSize, TC);
 
-  if (TC % MaxVF != 0) {
+  if (TC % Range.first != 0 || TC % Range.second != 0) {
     // If the trip count that we found modulo the vectorization factor is not
     // zero then we require a tail.
     // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
@@ -6074,15 +6077,19 @@
     return None;
   }
 
-  return MaxVF;
+  return Range;
 }
 
-unsigned
+std::pair<unsigned,unsigned>
 LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
                                                  unsigned ConstTripCount) {
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
-  unsigned SmallestType, WidestType;
-  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
+  SmallVector<Type*,4> ElemTypes = getSmallestAndWidestTypes();
+  if (ElemTypes.empty())
+    return { 1, 1 };
+  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+  unsigned SmallestType = DL.getTypeSizeInBits(ElemTypes.front());
+  unsigned WidestType = std::max(8ul, DL.getTypeSizeInBits(ElemTypes.back()));
   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
 
   // Get the maximum safe dependence distance in bits computed by LAA.
@@ -6093,7 +6100,11 @@
 
   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
 
-  unsigned MaxVectorSize = WidestRegister / WidestType;
+  unsigned MinVF = 1;
+  for (Type *Ty : ElemTypes)
+    MinVF = std::max(MinVF, TTI.getMinimumVF(Ty));
+
+  unsigned MaxVectorSize = std::max(MinVF, WidestRegister / WidestType);
 
   DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
                << WidestType << " bits.\n");
@@ -6105,7 +6116,7 @@
   if (MaxVectorSize == 0) {
     DEBUG(dbgs() << "LV: The target has no vector registers.\n");
     MaxVectorSize = 1;
-    return MaxVectorSize;
+    return { MaxVectorSize, MaxVectorSize };
   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
              isPowerOf2_32(ConstTripCount)) {
     // We need to clamp the VF to be the ConstTripCount. There is no point in
@@ -6113,7 +6124,7 @@
     DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                  << ConstTripCount << "\n");
     MaxVectorSize = ConstTripCount;
-    return MaxVectorSize;
+    return { MaxVectorSize, MaxVectorSize };
   }
 
   unsigned MaxVF = MaxVectorSize;
@@ -6138,11 +6149,12 @@
       }
     }
   }
-  return MaxVF;
+  return { MinVF, MaxVF };
 }
 
 VectorizationFactor
-LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
+LoopVectorizationCostModel::selectVectorizationFactor(unsigned MinVF,
+                                                      unsigned MaxVF) {
   float Cost = expectedCost(1).first;
   const float ScalarCost = Cost;
   unsigned Width = 1;
@@ -6151,11 +6163,11 @@
   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
   // Ignore scalar width, because the user explicitly wants vectorization.
   if (ForceVectorization && MaxVF > 1) {
-    Width = 2;
+    Width = MinVF;
     Cost = expectedCost(Width).first / (float)Width;
   }
 
-  for (unsigned i = 2; i <= MaxVF; i *= 2) {
+  for (unsigned i = MinVF; i <= MaxVF; i *= 2) {
     // Notice that the vector loop needs to be executed less times, so
     // we need to divide the cost of the vector loops by the width of
     // the vector elements.
@@ -6191,10 +6203,10 @@
   return Factor;
 }
 
-std::pair<unsigned, unsigned>
+SmallVector<Type*,4>
 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
-  unsigned MinWidth = -1U;
-  unsigned MaxWidth = 8;
+  SmallVector<Type*,4> SortedTypes;
+  SetVector<Type*> ElemTypes;
   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
 
   // For each block.
@@ -6237,14 +6249,16 @@
           !Legal->isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
         continue;
 
-      MinWidth = std::min(MinWidth,
-                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
-      MaxWidth = std::max(MaxWidth,
-                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
+      ElemTypes.insert(T->getScalarType());
     }
   }
 
-  return {MinWidth, MaxWidth};
+  SortedTypes.assign(ElemTypes.begin(), ElemTypes.end());
+  llvm::sort(SortedTypes.begin(), SortedTypes.end(),
+             [&DL] (Type *T0, Type *T1) {
+               return DL.getTypeSizeInBits(T0) < DL.getTypeSizeInBits(T1);
+             });
+  return SortedTypes;
 }
 
 unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
@@ -7396,7 +7410,8 @@
 LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
   // Width 1 means no vectorize, cost 0 means uncomputed cost.
   const VectorizationFactor NoVectorization = {1U, 0U};
-  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
+  Optional<std::pair<unsigned,unsigned>> MaybeMaxVF =
+     CM.computeMaxVF(OptForSize);
   if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
     return NoVectorization;
 
@@ -7411,26 +7426,32 @@
     return {UserVF, 0};
   }
 
-  unsigned MaxVF = MaybeMaxVF.getValue();
-  assert(MaxVF != 0 && "MaxVF is zero.");
+  unsigned MinVF, MaxVF;
+  std::tie(MinVF, MaxVF) = MaybeMaxVF.getValue();
+  assert(MinVF != 0 && MaxVF != 0 && "MinVF or MaxVF is zero.");
+
+  // Collect Uniform and Scalar instructions.
+  if (MinVF > 1)
+    CM.collectUniformsAndScalars(1);
 
-  for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
+  for (unsigned VF = MinVF; VF <= MaxVF; VF *= 2) {
     // Collect Uniform and Scalar instructions after vectorization with VF.
     CM.collectUniformsAndScalars(VF);
 
     // Collect the instructions (and their associated costs) that will be more
     // profitable to scalarize.
-    if (VF > 1)
-      CM.collectInstsToScalarize(VF);
+    CM.collectInstsToScalarize(VF);
   }
 
-  buildVPlans(1, MaxVF);
+  if (MinVF > 1)
+    buildVPlans(1, 1);
+  buildVPlans(MinVF, MaxVF);
   DEBUG(printPlans(dbgs()));
   if (MaxVF == 1)
     return NoVectorization;
 
   // Select the optimal vectorization factor.
-  return CM.selectVectorizationFactor(MaxVF);
+  return CM.selectVectorizationFactor(MinVF, MaxVF);
 }
 
 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
Index: test/Transforms/LoopUnroll/runtime-loop4.ll
===================================================================
--- test/Transforms/LoopUnroll/runtime-loop4.ll
+++ test/Transforms/LoopUnroll/runtime-loop4.ll
@@ -37,7 +37,7 @@
   store i32 %iv2, i32* %offset2, align 4
   %inc2 = add i32 %iv2, 1
   %exitcnd2 = icmp uge i32 %inc2, %iter
-  br i1 %exitcnd2, label %exit2, label %loop2
+  br i1 %exitcnd2, label %exit2, label %loop2, !llvm.loop !0
 
 exit2:
   br label %loop1.latch
@@ -45,8 +45,11 @@
 loop1.latch:
   %inc1 = add i32 %iv1, 1
   %exitcnd1 = icmp uge i32 %inc1, 1024
-  br i1 %exitcnd1, label %exit, label %loop1
+  br i1 %exitcnd1, label %exit, label %loop1, !llvm.loop !0
 
 exit:
   ret void
 }
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 0}
Index: test/Transforms/LoopVectorize/X86/funclet.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/funclet.ll
+++ test/Transforms/LoopVectorize/X86/funclet.ll
@@ -22,7 +22,7 @@
   %call = call double @floor(double 1.0) #1 [ "funclet"(token %1) ]
   %inc = add nuw nsw i32 %i.07, 1
   %exitcond = icmp eq i32 %inc, 1024
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !0
 
 try.cont:                                         ; preds = %for.cond.cleanup
   ret void
@@ -43,3 +43,6 @@
 
 attributes #0 = { "target-features"="+sse2" }
 attributes #1 = { nounwind readnone }
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.width", i32 16}
Index: test/Transforms/LoopVectorize/pr32859.ll
===================================================================
--- test/Transforms/LoopVectorize/pr32859.ll
+++ test/Transforms/LoopVectorize/pr32859.ll
@@ -22,9 +22,12 @@
 if.end.2.i:                                       ; preds = %for.cond1.preheader.i
   %inc5.i = add nsw i32 %c.06.i, 1
   %cmp.i = icmp slt i32 %inc5.i, 16
-  br i1 %cmp.i, label %for.cond1.preheader.i, label %for.cond.preheader
+  br i1 %cmp.i, label %for.cond1.preheader.i, label %for.cond.preheader, !llvm.loop !0
 
 for.cond.preheader:                               ; preds = %if.end.2.i
   %e.0.ph = phi i32 [ 0, %if.end.2.i ]
   unreachable
 }
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
Index: test/Transforms/LoopVectorize/unroll.ll
===================================================================
--- test/Transforms/LoopVectorize/unroll.ll
+++ test/Transforms/LoopVectorize/unroll.ll
@@ -27,7 +27,7 @@
   store i32 3, i32* %arrayidx, align 4
   %inc = add nuw nsw i64 %i.06, 1
   %cmp = icmp slt i64 %inc, %conv
-  br i1 %cmp, label %for.body, label %for.end.loopexit
+  br i1 %cmp, label %for.body, label %for.end.loopexit, !llvm.loop !0
 
 for.end.loopexit:                                 ; preds = %for.body
   br label %for.end
@@ -35,3 +35,6 @@
 for.end:                                          ; preds = %for.end.loopexit, %entry
   ret void
 }
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.width", i32 1}