Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -264,11 +264,11 @@
   InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                       DominatorTree *DT, const TargetLibraryInfo *TLI,
                       const TargetTransformInfo *TTI, unsigned VecWidth,
-                      unsigned UnrollFactor)
+                      unsigned UnrollFactor, unsigned MVSize)
       : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
-        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()),
-        Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor),
-        Legal(nullptr), AddedSafetyChecks(false) {}
+        VF(VecWidth), MaxVectorSize(MVSize), UF(UnrollFactor),
+        Builder(SE->getContext()), Induction(nullptr), OldInduction(nullptr),
+        WidenMap(UnrollFactor), Legal(nullptr), AddedSafetyChecks(false) {}
 
   // Perform the actual loop widening (vectorization).
   void vectorize(LoopVectorizationLegality *L) {
@@ -433,6 +433,9 @@
   /// vector elements.
   unsigned VF;
 
+  /// The max possible Vector Factor.
+  unsigned MaxVectorSize;
+
 protected:
   /// The vectorization unroll factor to use. Each scalar is vectorized to this
   /// many different vector instructions.
@@ -479,7 +482,8 @@
   InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
                     DominatorTree *DT, const TargetLibraryInfo *TLI,
                     const TargetTransformInfo *TTI, unsigned UnrollFactor)
-      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor) {}
+      : InnerLoopVectorizer(OrigLoop, SE, LI, DT, TLI, TTI, 1, UnrollFactor,
+                            0) {}
 
 private:
   void scalarizeInstruction(Instruction *Instr,
@@ -1088,7 +1092,7 @@
                              const TargetLibraryInfo *TLI, AssumptionCache *AC,
                              const Function *F, const LoopVectorizeHints *Hints)
       : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI),
-        TheFunction(F), Hints(Hints) {
+        TheFunction(F), Hints(Hints), MaxVectorSize(0) {
     CodeMetrics::collectEphemeralValues(L, AC, EphValues);
   }
 
@@ -1108,6 +1112,9 @@
   /// 64 bit loop indices.
   unsigned getWidestType();
 
+  /// \return The maximum possible vector factor.
+  unsigned getMaxVectorSize() { return MaxVectorSize; }
+
   /// \return The desired interleave count.
   /// If interleave count has been specified by metadata it will be returned.
   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
@@ -1177,6 +1184,8 @@
   const Function *TheFunction;
   // Loop Vectorize Hint.
   const LoopVectorizeHints *Hints;
+  // The max possible Vector Factor.
+  unsigned MaxVectorSize;
 };
 
 /// Utility class for getting and setting loop vectorizer hints in the form
@@ -1675,7 +1684,8 @@
       Unroller.vectorize(&LVL);
     } else {
       // If we decided that it is *legal* to vectorize the loop then do it.
-      InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC);
+      InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC,
+                             CM.getMaxVectorSize());
       LB.vectorize(&LVL);
       ++LoopsVectorized;
 
@@ -2461,7 +2471,7 @@
    the vectorized instructions while the old loop will continue to run the
    scalar remainder.
 
-       [ ] <-- Back-edge taken count overflow check.
+       [ ] <-- loop iteration number check.
     /   |
    /    v
   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
@@ -2525,21 +2535,18 @@
   // Notice that the pre-header does not change, only the loop body.
   SCEVExpander Exp(*SE, DL, "induction");
 
-  // We need to test whether the backedge-taken count is uint##_max. Adding one
-  // to it will cause overflow and an incorrect loop trip count in the vector
-  // body. In case of overflow we want to directly jump to the scalar remainder
-  // loop.
-  Value *BackedgeCount =
-      Exp.expandCodeFor(BackedgeTakeCount, BackedgeTakeCount->getType(),
-                        VectorPH->getTerminator());
-  if (BackedgeCount->getType()->isPointerTy())
-    BackedgeCount = CastInst::CreatePointerCast(BackedgeCount, IdxTy,
-                                                "backedge.ptrcnt.to.int",
-                                                VectorPH->getTerminator());
-  Instruction *CheckBCOverflow =
-      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, BackedgeCount,
-                      Constant::getAllOnesValue(BackedgeCount->getType()),
-                      "backedge.overflow", VectorPH->getTerminator());
+  // We test the iteration number and if it is less than MaxVectorSize we want
+  // to directly jump to the scalar remainder loop.
+  Value *ExitCountValue = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+                                            VectorPH->getTerminator());
+  if (ExitCountValue->getType()->isPointerTy())
+    ExitCountValue = CastInst::CreatePointerCast(ExitCountValue, IdxTy,
+                                                 "exitcount.ptrcnt.to.int",
+                                                 VectorPH->getTerminator());
+  Instruction *CheckMinIters =
+      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, ExitCountValue,
+                      ConstantInt::get(ExitCount->getType(), MaxVectorSize),
+                      "min.iters.check", VectorPH->getTerminator());
 
   // The loop index does not have to start at Zero. Find the original start
   // value from the induction PHI node. If we don't have an induction variable
@@ -2594,12 +2601,11 @@
   // Generate code to check that the loop's trip count that we computed by
   // adding one to the backedge-taken count will not overflow.
   BasicBlock *NewVectorPH =
-      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "overflow.checked");
+      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "min.iters.checked");
   if (ParentLoop)
     ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI);
-  ReplaceInstWithInst(
-      VectorPH->getTerminator(),
-      BranchInst::Create(ScalarPH, NewVectorPH, CheckBCOverflow));
+  ReplaceInstWithInst(VectorPH->getTerminator(),
+                      BranchInst::Create(ScalarPH, NewVectorPH, CheckMinIters));
   VectorPH = NewVectorPH;
 
   // This is the IR builder that we use to add all of the logic for bypassing
@@ -4477,7 +4483,7 @@
     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
   WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
                     WidestRegister : MaxSafeDepDist);
-  unsigned MaxVectorSize = WidestRegister / WidestType;
+  MaxVectorSize = WidestRegister / WidestType;
   DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
   DEBUG(dbgs() << "LV: The Widest register is: "
           << WidestRegister << " bits.\n");
Index: test/Transforms/LoopVectorize/induction.ll
===================================================================
--- test/Transforms/LoopVectorize/induction.ll
+++ test/Transforms/LoopVectorize/induction.ll
@@ -113,8 +113,8 @@
 ; condition and branch directly to the scalar loop.
 
 ; CHECK-LABEL: max_i32_backedgetaken
-; CHECK:  %backedge.overflow = icmp eq i32 -1, -1
-; CHECK:  br i1 %backedge.overflow, label %scalar.ph, label %overflow.checked
+; CHECK:  %min.iters.check = icmp ult i32 0, 1
+; CHECK:  br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked
 
 ; CHECK: scalar.ph:
 ; CHECK:  %bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ 0, %0 ]
Index: test/Transforms/LoopVectorize/miniters.ll
===================================================================
--- test/Transforms/LoopVectorize/miniters.ll
+++ test/Transforms/LoopVectorize/miniters.ll
@@ -0,0 +1,41 @@
+; RUN: opt %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global [1000 x i32] zeroinitializer, align 16
+@c = common global [1000 x i32] zeroinitializer, align 16
+@a = common global [1000 x i32] zeroinitializer, align 16
+
+; Generate min.iters.check to skip the vector loop and jump to scalar.ph directly when loop iteration number is less than MaxVectorSize.
+; CHECK-LABEL: foo(
+; CHECK: %min.iters.check = icmp ult i64 %N, 4
+; CHECK: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked
+
+define void @foo(i64 %N) {
+entry:
+  %cmp.8 = icmp sgt i64 %N, 0
+  br i1 %cmp.8, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %i.09 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds [1000 x i32], [1000 x i32]* @b, i64 0, i64 %i.09
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds [1000 x i32], [1000 x i32]* @c, i64 0, i64 %i.09
+  %tmp1 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %tmp1, %tmp
+  %arrayidx2 = getelementptr inbounds [1000 x i32], [1000 x i32]* @a, i64 0, i64 %i.09
+  store i32 %add, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i64 %i.09, 1
+  %exitcond = icmp eq i64 %inc, %N
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}