Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7502,30 +7502,40 @@
   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
 }
 
+// Determine how to lower the scalar epilogue. This depends on whether we
+// optimise for minimum code size, whether options or loop hints force
+// predication, and a TTI hook that analyses if the loop suits predication.
 static ScalarEpilogueLowering
 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
                           ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
                           TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                           AssumptionCache *AC, LoopInfo *LI,
                           ScalarEvolution *SE, DominatorTree *DT,
-                          const LoopAccessInfo *LAI) {
-  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
+                          LoopVectorizationLegality *LVL) {
+  bool OptSize = F->hasOptSize() ||
+    llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+                                PGSOQueryType::IRPass);
+  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && OptSize)
+    return CM_ScalarEpilogueNotAllowedOptSize;
+
+  // If there is no primary induction variable, don't try to predicate the
+  // vector body, because predication requires one. Requesting it anyway would
+  // make vectorisation fail, which is not what we want if the loop could be
+  // vectorised with a scalar epilogue.
+  if (!LVL->getPrimaryInduction())
+    return CM_ScalarEpilogueAllowed;
+
   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
                               !PreferPredicateOverEpilog;
-
-  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
-      (F->hasOptSize() ||
-       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
-                                   PGSOQueryType::IRPass)))
-    SEL = CM_ScalarEpilogueNotAllowedOptSize;
-  else if (PreferPredicateOverEpilog ||
+  if (PreferPredicateOverEpilog ||
            Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
-           (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI) &&
+           (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
+                                             LVL->getLAI()) &&
             Hints.getPredicate() != LoopVectorizeHints::FK_Disabled &&
             !PredicateOptDisabled))
-    SEL = CM_ScalarEpilogueNotNeededUsePredicate;
+    return CM_ScalarEpilogueNotNeededUsePredicate;
 
-  return SEL;
+  return CM_ScalarEpilogueAllowed;
 }
 
 // Process the loop in the VPlan-native vectorization path. This path builds
@@ -7545,7 +7555,7 @@
 
   ScalarEpilogueLowering SEL =
     getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
-                              PSE.getSE(), DT, LVL->getLAI());
+                              PSE.getSE(), DT, LVL);
 
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                 &Hints, IAI);
@@ -7639,7 +7649,7 @@
   // should be optimized for size.
   ScalarEpilogueLowering SEL =
     getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
-                              PSE.getSE(), DT, LVL.getLAI());
+                              PSE.getSE(), DT, &LVL);
 
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here. They may require CFG and instruction level transformations before
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -disable-mve-tail-predication=false -S | FileCheck %s
+
+; Check that when we can't predicate this loop, it is still vectorised (with
+; an epilogue).
+; TODO: this can't be predicated because a primary induction variable can't
+; be found (yet) for this counting-down loop. Once that is fixed, it should
+; be possible to predicate this loop.
+
+; CHECK-LABEL: vector.body:
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-unknown-eabihf"
+
+define dso_local void @foo(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i8* noalias nocapture %C, i32 %N) #0 {
+entry:
+  %cmp6 = icmp eq i32 %N, 0
+  br i1 %cmp6, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %N.addr.010 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
+  %C.addr.09 = phi i8* [ %incdec.ptr4, %while.body ], [ %C, %while.body.preheader ]
+  %B.addr.08 = phi i8* [ %incdec.ptr1, %while.body ], [ %B, %while.body.preheader ]
+  %A.addr.07 = phi i8* [ %incdec.ptr, %while.body ], [ %A, %while.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %A.addr.07, i32 1
+  %0 = load i8, i8* %A.addr.07, align 1
+  %incdec.ptr1 = getelementptr inbounds i8, i8* %B.addr.08, i32 1
+  %1 = load i8, i8* %B.addr.08, align 1
+  %add = add i8 %1, %0
+  %incdec.ptr4 = getelementptr inbounds i8, i8* %C.addr.09, i32 1
+  store i8 %add, i8* %C.addr.09, align 1
+  %dec = add i32 %N.addr.010, -1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  ret void
+}
+
+attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
Index: llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s
+
+; Check that when we can't predicate this loop, it is still vectorised (with
+; an epilogue).
+; TODO: this can't be predicated because a primary induction variable can't
+; be found (yet) for this counting-down loop. Once that is fixed, it should
+; be possible to predicate this loop.
+
+; CHECK-LABEL: vector.body:
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+define dso_local void @foo(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i8* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp6 = icmp eq i32 %N, 0
+  br i1 %cmp6, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %N.addr.010 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
+  %C.addr.09 = phi i8* [ %incdec.ptr4, %while.body ], [ %C, %while.body.preheader ]
+  %B.addr.08 = phi i8* [ %incdec.ptr1, %while.body ], [ %B, %while.body.preheader ]
+  %A.addr.07 = phi i8* [ %incdec.ptr, %while.body ], [ %A, %while.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %A.addr.07, i32 1
+  %0 = load i8, i8* %A.addr.07, align 1
+  %incdec.ptr1 = getelementptr inbounds i8, i8* %B.addr.08, i32 1
+  %1 = load i8, i8* %B.addr.08, align 1
+  %add = add i8 %1, %0
+  %incdec.ptr4 = getelementptr inbounds i8, i8* %C.addr.09, i32 1
+  store i8 %add, i8* %C.addr.09, align 1
+  %dec = add i32 %N.addr.010, -1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  ret void
+}