Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1233,15 +1233,6 @@
 
   LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
 
-  if (!PrimaryInduction) {
-    reportVectorizationFailure(
-        "No primary induction, cannot fold tail by masking",
-        "Missing a primary induction variable in the loop, which is "
-        "needed in order to fold tail by masking as required.",
-        "NoPrimaryInduction", ORE, TheLoop);
-    return false;
-  }
-
   SmallPtrSet<const Value *, 8> ReductionLiveOuts;
 
   for (auto &Reduction : getReductionVars())
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6584,6 +6584,7 @@
                          &ILV,   CallbackILV};
   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
   State.TripCount = ILV.getOrCreateTripCount(nullptr);
+  State.VectorLoopScalarIV = ILV.Induction;
 
   //===------------------------------------------------===//
   //
@@ -6770,7 +6771,15 @@
 
     // Introduce the early-exit compare IV <= BTC to form header block mask.
     // This is used instead of IV < TC because TC may wrap, unlike BTC.
-    VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
+    // Start by constructing the desired canonical IV.
+    VPValue *IV = nullptr;
+    if (Legal->getPrimaryInduction())
+      IV = Plan->getVPValue(Legal->getPrimaryInduction());
+    else {
+      auto IVRecipe = new VPWidenPrimaryInductionRecipe();
+      Builder.getInsertBlock()->appendRecipe(IVRecipe);
+      IV = IVRecipe->getVPValue();
+    }
     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
     return BlockMaskCache[BB] = BlockMask;
@@ -7130,7 +7139,8 @@
   // Also, both the Phi and the live-out instruction of each reduction are
   // required in order to introduce a select between them in VPlan.
   if (CM.foldTailByMasking()) {
-    NeedDef.insert(Legal->getPrimaryInduction());
+    if (Legal->getPrimaryInduction())
+      NeedDef.insert(Legal->getPrimaryInduction());
     for (auto &Reduction : Legal->getReductionVars()) {
       NeedDef.insert(Reduction.first);
       NeedDef.insert(Reduction.second.getLoopExitInstr());
@@ -7566,9 +7576,8 @@
                               !PreferPredicateOverEpilog;
 
   // 2) Next, if disabling predication is requested on the command line, honour
-  // this and request a scalar epilogue. Also do this if we don't have a
-  // primary induction variable, which is required for predication.
-  if (PredicateOptDisabled || !LVL.getPrimaryInduction())
+  // this and request a scalar epilogue.
+  if (PredicateOptDisabled)
     return CM_ScalarEpilogueAllowed;
 
   // 3) and 4) look if enabling predication is requested on the command line,
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -329,6 +329,9 @@
   /// Values they correspond to.
   VPValue2ValueTy VPValue2Value;
 
+  /// Hold the scalar IV of the vector loop (start=0, step=VF*UF).
+  Value *VectorLoopScalarIV = nullptr;
+
   /// Hold the trip count of the scalar loop.
   Value *TripCount = nullptr;
 
@@ -614,6 +617,7 @@
     VPWidenIntOrFpInductionSC,
     VPWidenMemoryInstructionSC,
     VPWidenPHISC,
+    VPWidenPrimaryInductionSC,
     VPWidenSC,
   };
 
@@ -1139,6 +1143,35 @@
              VPSlotTracker &SlotTracker) const override;
 };
 
+/// A Recipe for widening the primary induction of the vector loop.
+class VPWidenPrimaryInductionRecipe : public VPRecipeBase {
+private:
+  VPValue *Val = nullptr;
+
+public:
+  VPWidenPrimaryInductionRecipe() : VPRecipeBase(VPWidenPrimaryInductionSC) {
+    Val = new VPValue();
+  }
+  ~VPWidenPrimaryInductionRecipe() override { delete Val; }
+
+  /// Return the VPValue representing the vector primary induction variable of
+  /// the vector loop.
+  VPValue *getVPValue() const { return Val; }
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPWidenPrimaryInductionSC;
+  }
+
+  /// Generate a canonical vector induction variable of the vector loop, with
+  /// start = <0,1,...,VF-1> and step = <VF,VF,...,VF>.
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+};
+
 /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
 /// holds a sequence of zero or more VPRecipe's each representing a sequence of
 /// output IR instructions.
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -802,6 +802,29 @@
   O << "\\l\"";
 }
 
+void VPWidenPrimaryInductionRecipe::execute(VPTransformState &State) {
+  Value *VLSIV = State.VectorLoopScalarIV;
+  Type *STy = VLSIV->getType();
+  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
+  Value *VStart = Builder.CreateVectorSplat(State.VF, VLSIV, "broadcast");
+  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+    SmallVector<Constant *, 8> Indices;
+    for (unsigned Lane = 0, VF = State.VF; Lane < VF; ++Lane)
+      Indices.push_back(ConstantInt::get(STy, Part * VF + Lane));
+    Constant *VStep = ConstantVector::get(Indices);
+    // Add the consecutive indices to the vector value.
+    Value *VLVIV = Builder.CreateAdd(VStart, VStep, "vlviv");
+    State.set(getVPValue(), VLVIV, Part);
+  }
+}
+
+void VPWidenPrimaryInductionRecipe::print(raw_ostream &O, const Twine &Indent,
+                                          VPSlotTracker &SlotTracker) const {
+  O << " +\n" << Indent << "\"EMIT ";
+  getVPValue()->printAsOperand(O, SlotTracker);
+  O << " = WIDEN-PRIMARY-INDUCTION \\l\"";
+}
+
 template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
 
 void VPValue::replaceAllUsesWith(VPValue *New) {
@@ -901,6 +924,9 @@
   for (const VPRecipeBase &Recipe : *VPBB) {
     if (const auto *VPI = dyn_cast<VPInstruction>(&Recipe))
       assignSlot(VPI);
+    else if (const auto *VPIV =
+                 dyn_cast<VPWidenPrimaryInductionRecipe>(&Recipe))
+      assignSlot(VPIV->getVPValue());
   }
 }
 
Index: llvm/test/Transforms/LoopVectorize/X86/small-size.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/small-size.ll
+++ llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -171,6 +171,14 @@
 ; N is unknown, we need a tail. Can't vectorize because loop has no primary
 ; induction.
 ;CHECK-LABEL: @example3(
+; CHECK:       vector.ph:
+; CHECK:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> {{.*}}, <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0,
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VPIV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK:    {{.*}} = icmp ule <4 x i64> [[VPIV]], [[BROADCAST_SPLAT2]]
 ;CHECK-NOT: <4 x i32>
 ;CHECK: ret void
 define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) optsize {
@@ -237,12 +245,12 @@
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[TMP7:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[TMP6:%.*]]
-; CHECK:         br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !7
+; CHECK:         br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !11
 ; CHECK:         ret void
 ;
   br label %1
@@ -353,12 +361,12 @@
 ; CHECK:       pred.store.continue22:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[TMP34:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[TMP33:%.*]]
-; CHECK:         br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !9
+; CHECK:         br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !13
 ; CHECK:         ret void
 ;
   br label %1
Index: llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
+++ llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
@@ -1,12 +1,11 @@
-; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -force-vector-width=4 -S | FileCheck %s
 
-; Check that when we can't predicate this loop that it is still vectorised (with
-; an epilogue).
-; TODO: the reason this can't be predicated is because a primary induction
-; variable can't be found (not yet) for this counting down loop. But with that
-; fixed, this should be able to be predicated.
+; Check that a counting-down loop which has no primary induction variable
+; is vectorized with preferred predication.
 
 ; CHECK-LABEL: vector.body:
+; CHECK-LABEL: middle.block:
+; CHECK-NEXT:    br i1 true,
 
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"