Index: include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -241,6 +241,10 @@
   /// If false, good old LV code.
   bool canVectorize(bool UseVPlanNativePath);
 
+  /// Return true if we can vectorize this loop while folding its tail by
+  /// masking.
+  bool canFoldTailByMasking();
+
   /// Returns the primary induction variable.
   PHINode *getPrimaryInduction() { return PrimaryInduction; }
 
@@ -316,6 +320,9 @@
   /// requires mask.
   bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
 
+  /// Returns true if all loop blocks should be masked to fold tail loop.
+  bool foldTailByMasking() const { return FoldTailByMasking; }
+
   unsigned getNumStores() const { return LAI->getNumStores(); }
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
 
@@ -475,6 +482,9 @@
   /// While vectorizing these instructions we have to generate a
   /// call to the appropriate masked intrinsic
   SmallPtrSet<const Instruction *, 8> MaskedOp;
+
+  /// All blocks of loop are to be masked to fold tail of scalar iterations.
+  bool FoldTailByMasking = false;
 };
 
 } // namespace llvm
Index: lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -790,7 +790,8 @@
 }
 
 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
-  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+  return (FoldTailByMasking ||
+          LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT));
 }
 
 bool LoopVectorizationLegality::blockCanBePredicated(
@@ -1069,4 +1070,60 @@
   return Result;
 }
 
+bool LoopVectorizationLegality::canFoldTailByMasking() {
+
+  LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
+
+  if (!PrimaryInduction) {
+    ORE->emit(createMissedAnalysis("NoPrimaryInduction")
+              << "Missing a primary induction variable in the loop, which is "
+              << "needed in order to fold tail by masking as required.");
+    LLVM_DEBUG(dbgs() << "LV: No primary induction, cannot fold tail by "
+                      << "masking.\n");
+    return false;
+  }
+
+  // TODO: handle reductions when tail is folded by masking.
+  if (!Reductions.empty()) {
+    ORE->emit(createMissedAnalysis("ReductionFoldingTailByMasking")
+              << "Cannot fold tail by masking in the presence of reductions.");
+    LLVM_DEBUG(dbgs() << "LV: Loop has reductions, cannot fold tail by "
+                      << "masking.\n");
+    return false;
+  }
+
+  // TODO: handle outside users when tail is folded by masking.
+  for (auto *AE : AllowedExit) {
+    // Check that all users of allowed exit values are inside the loop.
+    for (User *U : AE->users()) {
+      Instruction *UI = cast<Instruction>(U);
+      if (TheLoop->contains(UI))
+        continue;
+      ORE->emit(createMissedAnalysis("LiveOutFoldingTailByMasking")
+                << "Cannot fold tail by masking in the presence of live outs.");
+      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop has an "
+                        << "outside user for : " << *UI << '\n');
+      return false;
+    }
+  }
+
+  // The list of pointers that we can safely read and write to remains empty.
+  SmallPtrSet<Value *, 8> SafePointers;
+
+  // Check and mark all blocks for predication, including those that ordinarily
+  // do not need predication such as the header block.
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    if (!blockCanBePredicated(BB, SafePointers)) {
+      ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
+                << "control flow cannot be substituted for a select");
+      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as required.\n");
+      return false;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
+  FoldTailByMasking = true;
+  return true;
+}
+
 } // namespace llvm
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -957,18 +957,28 @@
       : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
 
   ~InterleavedAccessInfo() {
+    reset();
+  }
+
+  /// Analyze the interleaved accesses and collect them in interleave
+  /// groups. Substitute symbolic strides using \p Strides.
+  void analyzeInterleaving();
+
+  /// Invalidate groups, e.g., in case all blocks in loop will be predicated
+  /// contrary to original assumption. Although we currently prevent group
+  /// formation for predicated accesses, we may be able to relax this limitation
+  /// in the future once we handle more complicated blocks.
+  void reset() {
     SmallPtrSet<InterleaveGroup *, 4> DelSet;
     // Avoid releasing a pointer twice.
     for (auto &I : InterleaveGroupMap)
       DelSet.insert(I.second);
     for (auto *Ptr : DelSet)
       delete Ptr;
+    InterleaveGroupMap.clear();
+    RequiresScalarEpilogue = false;
   }
 
-  /// Analyze the interleaved accesses and collect them in interleave
-  /// groups. Substitute symbolic strides using \p Strides.
-  void analyzeInterleaving();
-
   /// Check if \p Instr belongs to any interleave group.
   bool isInterleaved(Instruction *Instr) const {
     return InterleaveGroupMap.count(Instr);
@@ -2594,6 +2604,7 @@
   if (TripCount)
     return TripCount;
 
+  assert(L && "Create Trip Count for null loop.");
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   // Find the loop boundaries.
   ScalarEvolution *SE = PSE.getSE();
@@ -2642,12 +2653,23 @@
   Value *TC = getOrCreateTripCount(L);
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
 
+  Type *Ty = TC->getType();
+  Constant *Step = ConstantInt::get(Ty, VF * UF);
+
+  // If the tail is to be folded by masking, round the number of iterations N
+  // up to a multiple of Step instead of rounding down. This is done by first
+  // adding Step-1 and then rounding down. Note that it's ok if this addition
+  // overflows: the vector induction variable will eventually wrap to zero given
+  // that it starts at zero and its Step is a power of two; the loop will then
+  // exit, with the last early-exit vector comparison also producing all-true.
+  if (Legal->foldTailByMasking())
+    TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
+
   // Now we need to generate the expression for the part of the loop that the
   // vectorized body will execute. This is equal to N - (N % Step) if scalar
   // iterations are not required for correctness, or N - Step, otherwise. Step
   // is equal to the vectorization factor (number of SIMD elements) times the
   // unroll factor (number of SIMD instructions).
-  Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
 
   // If there is a non-reversed interleaved group that may speculatively access
@@ -2710,8 +2732,13 @@
   // of zero. In this case we will also jump to the scalar loop.
   auto P = Cost->requiresScalarEpilogue() ?
       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
-  Value *CheckMinIters = Builder.CreateICmp(
-      P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
+
+  // If tail is to be folded, vector loop takes care of all iterations.
+  Value *CheckMinIters = Builder.getFalse();
+  if (!Legal->foldTailByMasking())
+    CheckMinIters = Builder.CreateICmp(
+        P, Count, ConstantInt::get(Count->getType(), VF * UF),
+        "min.iters.check");
 
   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
   // Update dominator tree immediately if the generated block is a
@@ -2948,9 +2975,12 @@
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
-  Value *CmpN =
-      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                      CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
+  // If tail is to be folded, we know we don't need to run the remainder.
+  Value *CmpN = Builder.getTrue();
+  if (!Legal->foldTailByMasking())
+    CmpN =
+        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+                        CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
   ReplaceInstWithInst(MiddleBlock->getTerminator(),
                       BranchInst::Create(ExitBlock, ScalarPH, CmpN));
 
@@ -4915,36 +4945,37 @@
   // If we optimize the program for size, avoid creating the tail loop.
   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
-  // If we don't know the precise trip count, don't try to vectorize.
-  if (TC < 2) {
-    ORE->emit(
-        createMissedAnalysis("UnknownLoopCountComplexCFG")
-        << "unable to calculate the loop count due to complex control flow");
-    LLVM_DEBUG(
-        dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+  if (TC == 1) {
+    LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
     return None;
   }
 
   unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
 
-  if (TC % MaxVF != 0) {
-    // If the trip count that we found modulo the vectorization factor is not
-    // zero then we require a tail.
-    // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
-    // FIXME: return None if loop requiresScalarEpilog(), or look for a
-    // smaller MaxVF that does not require a scalar epilog.
-
-    ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
-              << "cannot optimize for size and vectorize at the "
-                 "same time. Enable vectorization of this loop "
-                 "with '#pragma clang loop vectorize(enable)' "
-                 "when compiling with -Os/-Oz");
-    LLVM_DEBUG(
-        dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+  if (TC > 0 && TC % MaxVF == 0) {
+    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+    return MaxVF;
+  }
+
+  if (TC > 0 && TC < TinyTripCountInterleaveThreshold) {
+    ORE->emit(createMissedAnalysis("TinyTripCount")
+              << "The trip count of the loop is below the given threshold for "
+              << "loops with scalar iterations.");
+    LLVM_DEBUG(dbgs() << "LV: Aborting - trip count below given threshold for "
+                      << "loop with scalar iterations.\n");
     return None;
   }
 
-  return MaxVF;
+  // If we don't know the precise trip count, or if the trip count that we
+  // found modulo the vectorization factor is not zero, try to fold the tail
+  // by masking.
+  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+  // FIXME: return None if loop requiresScalarEpilog(), or look for a
+  // smaller MaxVF that does not require a scalar epilog.
+  if (Legal->canFoldTailByMasking())
+    return MaxVF;
+
+  return None;
 }
 
 unsigned
@@ -5182,6 +5213,9 @@
   // fit without causing spills. All of this is rounded down if necessary to be
   // a power of two. We want power of two interleave count to simplify any
   // addressing operations or alignment considerations.
+  // We also want power of two interleave counts to ensure that the induction
+  // variable of the vector loop wraps to zero when tail is folded by masking;
+  // this currently happens when OptForSize, in which case IC is set to 1 above.
   unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
                               R.MaxLocalUsers);
 
@@ -6316,6 +6350,10 @@
   if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
     return NoVectorization;
 
+  // Invalidate interleave groups if all blocks of loop will be predicated.
+  if (Legal->blockNeedsPredication(OrigLoop->getHeader()))
+    CM.InterleaveInfo.reset();
+
   if (UserVF) {
     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
@@ -6372,6 +6410,7 @@
                          DT, ILV.Builder, ILV.VectorLoopValueMap, &ILV, CallbackILV};
 
   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
+  State.TripCount = ILV.getOrCreateTripCount(nullptr);
 
   //===------------------------------------------------===//
   //
@@ -6551,9 +6590,17 @@
   // load/store/gather/scatter. Initialize BlockMask to no-mask.
   VPValue *BlockMask = nullptr;
 
-  // Loop incoming mask is all-one.
-  if (OrigLoop->getHeader() == BB)
+  if (OrigLoop->getHeader() == BB) {
+    if (!Legal->blockNeedsPredication(BB))
+      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
+
+    // Introduce the early-exit compare IV <= BTC to form header block mask.
+    // This is used instead of IV < TC because TC may wrap, unlike BTC.
+    VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
+    VPValue *BTC = Plan->getBackedgeTakenCount();
+    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
     return BlockMaskCache[BB] = BlockMask;
+  }
 
   // This is the block mask. We OR all incoming edges.
   for (auto *Predecessor : predecessors(BB)) {
@@ -6908,6 +6955,11 @@
       NeedDef.insert(Branch->getCondition());
   }
 
+  // If the tail is to be folded by masking, the primary induction variable
+  // needs to be represented in VPlan for it to model early-exit masking.
+  if (Legal->foldTailByMasking())
+    NeedDef.insert(Legal->getPrimaryInduction());
+
   // Collect instructions from the original loop that will become trivially dead
   // in the vectorized loop. We don't need to vectorize these instructions. For
   // example, original induction update instructions can become dead because we
Index: lib/Transforms/Vectorize/VPlan.h
===================================================================
--- lib/Transforms/Vectorize/VPlan.h
+++ lib/Transforms/Vectorize/VPlan.h
@@ -313,6 +313,9 @@
   /// Values they correspond to.
   VPValue2ValueTy VPValue2Value;
 
+  /// Hold the trip count of the scalar loop.
+  Value *TripCount = nullptr;
+
   /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
   InnerLoopVectorizer *ILV;
 
@@ -603,7 +606,7 @@
 
 public:
   /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
-  enum { Not = Instruction::OtherOpsEnd + 1 };
+  enum { Not = Instruction::OtherOpsEnd + 1, ICmpULE };
 
 private:
   typedef unsigned char OpcodeTy;
@@ -1107,6 +1110,10 @@
   // (operators '==' and '<').
   SmallPtrSet<VPValue *, 16> VPExternalDefs;
 
+  /// Represents the backedge taken count of the original loop, for folding
+  /// the tail.
+  VPValue *BackedgeTakenCount;
+
   /// Holds a mapping between Values and their corresponding VPValue inside
   /// VPlan.
   Value2VPValueTy Value2VPValue;
@@ -1115,13 +1122,17 @@
   VPLoopInfo VPLInfo;
 
 public:
-  VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {}
+  VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
+    BackedgeTakenCount = new VPValue();
+  }
 
   ~VPlan() {
     if (Entry)
       VPBlockBase::deleteCFG(Entry);
     for (auto &MapEntry : Value2VPValue)
-      delete MapEntry.second;
+      if (MapEntry.second != BackedgeTakenCount)
+        delete MapEntry.second;
+    delete BackedgeTakenCount; // Delete once, whether in Value2VPValue or not.
     for (VPValue *Def : VPExternalDefs)
       delete Def;
   }
@@ -1134,6 +1145,9 @@
 
   VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; }
 
+  /// The backedge taken count of the original loop.
+  VPValue *getBackedgeTakenCount() { return BackedgeTakenCount; }
+
   void addVF(unsigned VF) { VFs.insert(VF); }
 
   bool hasVF(unsigned VF) { return VFs.count(VF); }
Index: lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- lib/Transforms/Vectorize/VPlan.cpp
+++ lib/Transforms/Vectorize/VPlan.cpp
@@ -249,6 +249,13 @@
     State.set(this, V, Part);
     break;
   }
+  case VPInstruction::ICmpULE: {
+    Value *IV = State.get(getOperand(0), Part);
+    Value *TC = State.get(getOperand(1), Part);
+    Value *V = Builder.CreateICmpULE(IV, TC);
+    State.set(this, V, Part);
+    break;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -274,6 +281,9 @@
   case VPInstruction::Not:
     O << "not";
     break;
+  case VPInstruction::ICmpULE:
+    O << "icmp ule";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
@@ -288,6 +298,15 @@
 /// LoopVectorBody basic-block was created for this. Introduce additional
 /// basic-blocks as needed, and fill them all.
 void VPlan::execute(VPTransformState *State) {
+  // -1. Check if the backedge taken count is needed, and if so build it.
+  if (BackedgeTakenCount->getNumUsers()) {
+    Value *TC = State->TripCount;
+    IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
+    auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
+                                   "trip.count.minus.1");
+    Value2VPValue[TCMO] = BackedgeTakenCount;
+  }
+
   // 0. Set the reverse mapping from VPValues to Values for code generation.
   for (auto &Entry : Value2VPValue)
     State->VPValue2Value[Entry.second] = Entry.first;
@@ -392,8 +411,9 @@
   OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
   if (!Plan.getName().empty())
     OS << "\\n" << DOT::EscapeString(Plan.getName());
+  OS << ", where: \\n" << *Plan.getBackedgeTakenCount()
+     << " := BackedgeTakenCount";
   if (!Plan.Value2VPValue.empty()) {
-    OS << ", where:";
     for (auto Entry : Plan.Value2VPValue) {
       OS << "\\n" << *Entry.second;
       OS << DOT::EscapeString(" := ");
Index: test/Transforms/LoopVectorize/X86/optsize.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/optsize.ll
+++ test/Transforms/LoopVectorize/X86/optsize.ll
@@ -0,0 +1,57 @@
+; This test verifies that the loop vectorizer WILL vectorize, WITHOUT producing
+; a scalar tail loop, when the 'optsize' or 'minsize' attribute is present,
+; by using masking to fold the tail.
+; RUN: opt < %s -loop-vectorize -S -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
+
+@tab = common global [32 x i8] zeroinitializer, align 1
+
+define i32 @foo_optsize() #0 {
+; CHECK-LABEL: @foo_optsize(
+; CHECK: x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, 202
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+attributes #0 = { optsize }
+
+define i32 @foo_minsize() #1 {
+; CHECK-LABEL: @foo_minsize(
+; CHECK: x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, 202
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+attributes #1 = { minsize }
+
Index: test/Transforms/LoopVectorize/X86/small-size.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/small-size.ll
+++ test/Transforms/LoopVectorize/X86/small-size.ll
@@ -46,9 +46,12 @@
   ret void
 }
 
-; Can't vectorize in 'optsize' mode because we need a tail.
+; We can vectorize the first loop in 'optsize' mode by masking its tail.
+; Can't vectorize the second loop because it has no primary induction.
 ;CHECK-LABEL: @example2(
-;CHECK-NOT: store <4 x i32>
+;CHECK: <4 x i32>
+;CHECK: middle.block
+;CHECK-NOT: <4 x i32>
 ;CHECK: ret void
 define void @example2(i32 %n, i32 %x) optsize {
   %1 = icmp sgt i32 %n, 0
@@ -91,7 +94,8 @@
   ret void
 }
 
-; N is unknown, we need a tail. Can't vectorize.
+; N is unknown, we need a tail. Can't vectorize by masking it because the loop
+; has no primary induction.
 ;CHECK-LABEL: @example3(
 ;CHECK-NOT: <4 x i32>
 ;CHECK: ret void
@@ -166,4 +170,54 @@
   ret void
 }
 
+; We CAN vectorize this example by folding the tail using masking.
+;CHECK-LABEL: @example23c(
+;CHECK: <4 x i32>
+;CHECK: ret void
+define void @example23c(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+  br label %1
+
+;