Index: llvm/lib/Target/AArch64/AArch64Subtarget.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -129,11 +129,17 @@
     MaxBytesForLoopAlignment = 8;
     break;
   case CortexA710:
+    PrefFunctionLogAlignment = 4;
+    VScaleForTuning = 1;
+    PrefLoopLogAlignment = 5;
+    MaxBytesForLoopAlignment = 16;
+    break; // Same values as CortexX2; tail-folding default is left untouched.
   case CortexX2:
     PrefFunctionLogAlignment = 4;
     VScaleForTuning = 1;
     PrefLoopLogAlignment = 5;
     MaxBytesForLoopAlignment = 16;
+    setSVETailFoldingDefaultOpts(TFSimple); // Default: simple loops only.
     break;
   case A64FX:
     CacheLineSize = 256;
@@ -144,6 +150,7 @@
     MinPrefetchStride = 1024;
     MaxPrefetchIterationsAhead = 4;
     VScaleForTuning = 4;
+    setSVETailFoldingDefaultOpts(TFSimple); // Default: simple loops only.
     break;
   case AppleA7:
   case AppleA10:
@@ -200,6 +207,7 @@
     PrefLoopLogAlignment = 5;
     MaxBytesForLoopAlignment = 16;
     VScaleForTuning = 2;
+    setSVETailFoldingDefaultOpts(TFSimple); // Default: simple loops only.
     break;
   case Neoverse512TVB:
     PrefFunctionLogAlignment = 4;
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -40,40 +40,67 @@
 
 class TailFoldingKind {
 private:
-  uint8_t Bits = 0; // Currently defaults to disabled.
+  uint8_t Bits; // Bitwise OR of TailFoldingOpts flags.
 
 public:
-  enum TailFoldingOpts {
-    TFDisabled = 0x0,
-    TFReductions = 0x01,
-    TFRecurrences = 0x02,
-    TFSimple = 0x80,
-    TFAll = TFReductions | TFRecurrences | TFSimple
-  };
+  TailFoldingKind(uint8_t Bits) : Bits(Bits) {}
 
-  void operator=(const std::string &Val) {
-    if (Val.empty())
-      return;
-    SmallVector<StringRef, 6> TailFoldTypes;
-    StringRef(Val).split(TailFoldTypes, '+', -1, false);
+  void add(uint8_t Bit) { Bits |= Bit; }
+
+  void remove(uint8_t Bit) { Bits &= ~Bit; }
+
+  operator uint8_t() const { return Bits; }
+};
+
+class TailFoldingOption {
+private:
+  uint8_t DefaultBits = TFAll; // Used for "default" and when no value is set.
+  std::string OrigVal;         // Owns the text that TailFoldTypes points into.
+  SmallVector<StringRef, 4> TailFoldTypes; // '+'-separated option elements.
+
+  uint8_t getBits() const { // Evaluated lazily so setDefault() can come later.
+    if (TailFoldTypes.empty())
+      return DefaultBits;
+
+    TailFoldingKind Bits(TFDisabled);
     for (auto TailFoldType : TailFoldTypes) {
       if (TailFoldType == "disabled")
-        Bits = 0;
+        Bits.remove(TFAll);
       else if (TailFoldType == "all")
-        Bits = TFAll;
+        Bits.add(TFAll);
       else if (TailFoldType == "default")
-        Bits = 0; // Currently defaults to never tail-folding.
+        Bits.add(DefaultBits);
       else if (TailFoldType == "simple")
-        add(TFSimple);
+        Bits.add(TFSimple);
       else if (TailFoldType == "reductions")
-        add(TFReductions);
+        Bits.add(TFReductions);
       else if (TailFoldType == "recurrences")
-        add(TFRecurrences);
+        Bits.add(TFRecurrences);
       else if (TailFoldType == "noreductions")
-        remove(TFReductions);
+        Bits.remove(TFReductions);
       else if (TailFoldType == "norecurrences")
-        remove(TFRecurrences);
-      else {
+        Bits.remove(TFRecurrences);
+      else
+        llvm_unreachable("Unexpected -sve-tail-folding element");
+    }
+
+    return Bits;
+  }
+
+public:
+  void setDefault(uint8_t V) { DefaultBits = V; }
+
+  void operator=(const std::string &Val) {
+    if (Val.empty())
+      return;
+
+    OrigVal = Val;
+    StringRef(OrigVal).split(TailFoldTypes, '+', -1, false);
+    for (auto TailFoldType : TailFoldTypes) {
+      if (TailFoldType != "disabled" && TailFoldType != "all" &&
+          TailFoldType != "default" && TailFoldType != "simple" &&
+          TailFoldType != "reductions" && TailFoldType != "recurrences" &&
+          TailFoldType != "noreductions" && TailFoldType != "norecurrences") {
         errs()
             << "invalid argument " << TailFoldType.str()
             << " to -sve-tail-folding=; each element must be one of: disabled, "
@@ -83,15 +110,20 @@
     }
   }
 
-  operator uint8_t() const { return Bits; }
-
-  void add(uint8_t Flag) { Bits |= Flag; }
-  void remove(uint8_t Flag) { Bits &= ~Flag; }
+  bool satisfies(TailFoldingKind Required) const {
+    return (getBits() & Required) == Required; // Every required bit is set.
+  }
 };
 
-TailFoldingKind TailFoldingKindLoc;
+TailFoldingOption TailFoldingOptionLoc; // Storage for -sve-tail-folding.
+
+namespace llvm {
+void setSVETailFoldingDefaultOpts(uint8_t V) {
+  TailFoldingOptionLoc.setDefault(V); // Updates the single global default.
+}
+} // namespace llvm
 
-cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
+cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
     "sve-tail-folding",
     cl::desc(
         "Control the use of vectorisation using tail-folding for SVE:"
@@ -104,7 +136,7 @@
         "\nreductions  Use tail-folding for loops containing reductions"
         "\nrecurrences Use tail-folding for loops containing first order "
         "recurrences"),
-    cl::location(TailFoldingKindLoc));
+    cl::location(TailFoldingOptionLoc));
 
 bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                          const Function *Callee) const {
@@ -3033,7 +3065,7 @@
     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
     TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
     InterleavedAccessInfo *IAI) {
-  if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
+  if (!ST->hasSVE()) // Disabled tail-folding is caught by satisfies() below.
     return false;
 
   // We don't currently support vectorisation with interleaving for SVE - with
@@ -3042,13 +3074,13 @@
   if (IAI->hasGroups())
     return false;
 
-  TailFoldingKind Required; // Defaults to 0.
+  TailFoldingKind Required(0); // Start with no tail-folding kinds required.
   if (LVL->getReductionVars().size())
-    Required.add(TailFoldingKind::TFReductions);
+    Required.add(TFReductions);
   if (LVL->getFirstOrderRecurrences().size())
-    Required.add(TailFoldingKind::TFRecurrences);
+    Required.add(TFRecurrences);
   if (!Required)
-    Required.add(TailFoldingKind::TFSimple);
+    Required.add(TFSimple); // Neither reductions nor recurrences present.
 
-  return (TailFoldingKindLoc & Required) == Required;
+  return TailFoldingOptionLoc.satisfies(Required);
 }
Index: llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
===================================================================
--- llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -511,6 +511,20 @@
   }
 }
 
+/// Bit flags naming the kinds of loop for which SVE tail-folding may be used.
+/// Flags are combined with bitwise OR.
+enum TailFoldingOpts : uint8_t {
+  TFDisabled = 0x0,
+  TFReductions = 0x01,  // Loops containing reductions.
+  TFRecurrences = 0x02, // Loops containing first-order recurrences.
+  TFSimple = 0x80,      // Loops with neither reductions nor recurrences.
+  TFAll = TFReductions | TFRecurrences | TFSimple
+};
+
+/// Sets the TailFoldingOpts mask used when -sve-tail-folding has no value, or
+/// for its "default" element.
+void setSVETailFoldingDefaultOpts(uint8_t DefaultOpts);
+
 namespace AArch64ExactFPImm {
   struct ExactFPImm {
     const char *Name;
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9746,6 +9746,11 @@
     return CM_ScalarEpilogueAllowed;
   };
 
+  // If the EpilogueVectorizationForceVF option was given explicitly, honour
+  // it and keep the scalar epilogue instead of consulting the TTI hook below.
+  if (EpilogueVectorizationForceVF.getNumOccurrences())
+    return CM_ScalarEpilogueAllowed;
+
   // 4) if the TTI hook indicates this is profitable, request predication.
   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
     return CM_ScalarEpilogueNotNeededUsePredicate;
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -1,10 +1,14 @@
 ; RUN: opt < %s -loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
-; RUN: opt < %s -loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s -check-prefix=CHECK-TF
 ; RUN: opt < %s -loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
-; RUN: opt < %s -loop-vectorize -sve-tail-folding=disabled+simple+reductions+recurrences -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=default+disabled+simple+reductions+recurrences -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -loop-vectorize -S -mcpu=neoverse-v1 -sve-tail-folding=default+reductions+recurrences | FileCheck %s -check-prefix=CHECK-TF
 ; RUN: opt < %s -loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
-; RUN: opt < %s -loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=default+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
 ; RUN: opt < %s -loop-vectorize -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
+; RUN: opt < %s -loop-vectorize -S -sve-tail-folding=default -mcpu=neoverse-v1 | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
+; RUN: opt < %s -loop-vectorize -S -mcpu=neoverse-v1 -sve-tail-folding=default | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -49,6 +53,14 @@
 ; CHECK-TF-ONLYRED-NOT:     %{{.*}} = phi <vscale x 4 x i1>
 ; CHECK-TF-ONLYRED:         store <vscale x 4 x i32> %[[SPLAT]], <vscale x 4 x i32>*
 
+; CHECK-NEOVERSE-V1-LABEL: @simple_memset(
+; CHECK-NEOVERSE-V1:       vector.ph:
+; CHECK-NEOVERSE-V1:         %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
+; CHECK-NEOVERSE-V1:         %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:         call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]
+
 entry:
   br label %while.body
 
@@ -110,6 +122,15 @@
 ; CHECK-TF-ONLYRED:         %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
 ; CHECK-TF-ONLYRED:       middle.block:
 ; CHECK-TF-ONLYRED-NEXT:    call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+
+; CHECK-NEOVERSE-V1-LABEL: @fadd_red_fast
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1-NOT:     %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:         %[[LOAD:.*]] = load <vscale x 4 x float>
+; CHECK-NEOVERSE-V1:         %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-NEOVERSE-V1:       middle.block:
+; CHECK-NEOVERSE-V1-NEXT:    call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])
+
 entry:
   br label %for.body
 
@@ -193,6 +214,19 @@
 ; CHECK-TF-ONLYRED:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-TF-ONLYRED:         store <vscale x 4 x i32> %[[ADD]]
 
+; CHECK-NEOVERSE-V1-LABEL: @add_recur
+; CHECK-NEOVERSE-V1:       entry:
+; CHECK-NEOVERSE-V1:         %[[PRE:.*]] = load i32, i32* %src, align 4
+; CHECK-NEOVERSE-V1:       vector.ph:
+; CHECK-NEOVERSE-V1:         %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1-NOT:     %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:         %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-NEOVERSE-V1:         %[[LOAD]] = load <vscale x 4 x i32>
+; CHECK-NEOVERSE-V1:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-NEOVERSE-V1:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-NEOVERSE-V1:         store <vscale x 4 x i32> %[[ADD]]
+
 entry:
   %.pre = load i32, i32* %src, align 4
   br label %for.body
@@ -238,6 +272,12 @@
 ; CHECK-TF-NOREC:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-TF-NOREC:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 
+; CHECK-NEOVERSE-V1-LABEL: @interleave(
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1:         %[[LOAD:.*]] = load <8 x float>, <8 x float>
+; CHECK-NEOVERSE-V1:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEOVERSE-V1:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
 entry:
   br label %for.body