Index: llvm/lib/Target/AArch64/AArch64Subtarget.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -129,11 +129,17 @@
     MaxBytesForLoopAlignment = 8;
     break;
   case CortexA710:
+    PrefFunctionLogAlignment = 4;
+    VScaleForTuning = 1;
+    PrefLoopLogAlignment = 5;
+    MaxBytesForLoopAlignment = 16;
+    break;
   case CortexX2:
     PrefFunctionLogAlignment = 4;
     VScaleForTuning = 1;
     PrefLoopLogAlignment = 5;
     MaxBytesForLoopAlignment = 16;
+    setSVETailFoldingDefaultOpts(TFSimple);
     break;
   case A64FX:
     CacheLineSize = 256;
@@ -144,6 +150,7 @@
     MinPrefetchStride = 1024;
     MaxPrefetchIterationsAhead = 4;
     VScaleForTuning = 4;
+    setSVETailFoldingDefaultOpts(TFSimple);
     break;
   case AppleA7:
   case AppleA10:
@@ -200,6 +207,7 @@
     PrefLoopLogAlignment = 5;
     MaxBytesForLoopAlignment = 16;
     VScaleForTuning = 2;
+    setSVETailFoldingDefaultOpts(TFSimple);
     break;
   case Neoverse512TVB:
     PrefFunctionLogAlignment = 4;
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -40,29 +40,27 @@

 class TailFoldingKind {
 private:
-  uint8_t Bits = 0; // Currently defaults to disabled.
+  uint8_t DefaultBits = TFAll;
+  uint8_t AddBits = 0;
+  uint8_t RemoveBits = 0;
+  bool NeedsDefault = true;

 public:
-  enum TailFoldingOpts {
-    TFDisabled = 0x0,
-    TFReductions = 0x01,
-    TFRecurrences = 0x02,
-    TFSimple = 0x80,
-    TFAll = TFReductions | TFRecurrences | TFSimple
-  };
+  void setDefault(uint8_t V) { DefaultBits = V; }

   void operator=(const std::string &Val) {
     if (Val.empty())
       return;
     SmallVector<StringRef, 4> TailFoldTypes;
     StringRef(Val).split(TailFoldTypes, '+', -1, false);
+    NeedsDefault = false;
     for (auto TailFoldType : TailFoldTypes) {
       if (TailFoldType == "disabled")
-        Bits = 0;
+        remove(TFAll);
       else if (TailFoldType == "all")
-        Bits = TFAll;
+        add(TFAll);
       else if (TailFoldType == "default")
-        Bits = 0; // Currently defaults to never tail-folding.
+        NeedsDefault = true;
       else if (TailFoldType == "simple")
         add(TFSimple);
       else if (TailFoldType == "reductions")
@@ -83,14 +81,32 @@
     }
   }

-  operator uint8_t() const { return Bits; }
+  operator uint8_t() const {
+    uint8_t Bits = NeedsDefault ? DefaultBits : 0;
+    Bits |= AddBits;
+    Bits &= ~RemoveBits;
+    return Bits;
+  }
+
+  void add(uint8_t Flags) {
+    AddBits |= Flags;
+    RemoveBits &= ~Flags;
+  }

-  void add(uint8_t Flag) { Bits |= Flag; }
-  void remove(uint8_t Flag) { Bits &= ~Flag; }
+  void remove(uint8_t Flags) {
+    RemoveBits |= Flags;
+    AddBits &= ~Flags;
+  }
 };

 TailFoldingKind TailFoldingKindLoc;

+namespace llvm {
+void setSVETailFoldingDefaultOpts(uint8_t V) {
+  TailFoldingKindLoc.setDefault(V);
+}
+} // namespace llvm
+
 cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
     "sve-tail-folding",
     cl::desc(
@@ -3029,7 +3045,7 @@
     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
     TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
     InterleavedAccessInfo *IAI) {
-  if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
+  if (!ST->hasSVE() || TailFoldingKindLoc == TFDisabled)
     return false;

   // We don't currently support vectorisation with interleaving for SVE - with
@@ -3039,12 +3055,13 @@
     return false;

   TailFoldingKind Required; // Defaults to 0.
+  Required.setDefault(0);
   if (LVL->getReductionVars().size())
-    Required.add(TailFoldingKind::TFReductions);
+    Required.add(TFReductions);
   if (LVL->getFirstOrderRecurrences().size())
-    Required.add(TailFoldingKind::TFRecurrences);
+    Required.add(TFRecurrences);
   if (!Required)
-    Required.add(TailFoldingKind::TFSimple);
+    Required.add(TFSimple);

   return (TailFoldingKindLoc & Required) == Required;
 }
Index: llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
===================================================================
--- llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -511,6 +511,16 @@
   }
 }

+enum TailFoldingOpts : uint8_t {
+  TFDisabled = 0x0,
+  TFReductions = 0x01,
+  TFRecurrences = 0x02,
+  TFSimple = 0x80,
+  TFAll = TFReductions | TFRecurrences | TFSimple
+};
+
+void setSVETailFoldingDefaultOpts(uint8_t);
+
 namespace AArch64ExactFPImm {
 struct ExactFPImm {
   const char *Name;
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9744,6 +9744,11 @@
     return CM_ScalarEpilogueAllowed;
   };

+  // If we're forcing the use of epilogue vectorization we should honour that
+  // instead of the TTI hook behaviour.
+  if (EpilogueVectorizationForceVF.getNumOccurrences())
+    return CM_ScalarEpilogueAllowed;
+
   // 4) if the TTI hook indicates this is profitable, request predication.
   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
     return CM_ScalarEpilogueNotNeededUsePredicate;
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -1,10 +1,14 @@
 ; RUN: opt < %s -loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
-; RUN: opt < %s -loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s -check-prefix=CHECK-TF
 ; RUN: opt < %s -loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
-; RUN: opt < %s -loop-vectorize -sve-tail-folding=disabled+simple+reductions+recurrences -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=default+disabled+simple+reductions+recurrences -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -loop-vectorize -S -mcpu=neoverse-v1 -sve-tail-folding=default+reductions+recurrences | FileCheck %s -check-prefix=CHECK-TF
 ; RUN: opt < %s -loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
-; RUN: opt < %s -loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=default+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
 ; RUN: opt < %s -loop-vectorize -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
+; RUN: opt < %s -loop-vectorize -S -sve-tail-folding=default -mcpu=neoverse-v1 | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
+; RUN: opt < %s -loop-vectorize -S -mcpu=neoverse-v1 -sve-tail-folding=default | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1

 target triple = "aarch64-unknown-linux-gnu"
@@ -49,6 +53,14 @@
 ; CHECK-TF-ONLYRED-NOT:     %{{.*}} = phi <vscale x 4 x i1>
 ; CHECK-TF-ONLYRED:         store <vscale x 4 x i32> %[[SPLAT]], <vscale x 4 x i32>*

+; CHECK-NEOVERSE-V1-LABEL: @simple_memset(
+; CHECK-NEOVERSE-V1:       vector.ph:
+; CHECK-NEOVERSE-V1:         %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
+; CHECK-NEOVERSE-V1:         %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:         call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]]
+
 entry:
   br label %while.body
@@ -110,6 +122,15 @@
 ; CHECK-TF-ONLYRED:         %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
 ; CHECK-TF-ONLYRED:       middle.block:
 ; CHECK-TF-ONLYRED-NEXT:    call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+
+; CHECK-NEOVERSE-V1-LABEL: @fadd_red_fast
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1-NOT:     %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:         %[[LOAD:.*]] = load <vscale x 4 x float>
+; CHECK-NEOVERSE-V1:         %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-NEOVERSE-V1:       middle.block:
+; CHECK-NEOVERSE-V1-NEXT:    call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])
+
 entry:
   br label %for.body
@@ -193,6 +214,19 @@
 ; CHECK-TF-ONLYRED:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-TF-ONLYRED:         store <vscale x 4 x i32> %[[ADD]]

+; CHECK-NEOVERSE-V1-LABEL: @add_recur
+; CHECK-NEOVERSE-V1:       entry:
+; CHECK-NEOVERSE-V1:         %[[PRE:.*]] = load i32, i32* %src, align 4
+; CHECK-NEOVERSE-V1:       vector.ph:
+; CHECK-NEOVERSE-V1:         %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1-NOT:     %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:         %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-NEOVERSE-V1:         %[[LOAD]] = load <vscale x 4 x i32>
+; CHECK-NEOVERSE-V1:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-NEOVERSE-V1:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-NEOVERSE-V1:         store <vscale x 4 x i32> %[[ADD]]
+
 entry:
   %.pre = load i32, i32* %src, align 4
   br label %for.body
@@ -238,6 +272,12 @@
 ; CHECK-TF-NOREC:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
 ; CHECK-TF-NOREC:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>

+; CHECK-NEOVERSE-V1-LABEL: @interleave(
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1:         %[[LOAD:.*]] = load <8 x float>, <8 x float>
+; CHECK-NEOVERSE-V1:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+; CHECK-NEOVERSE-V1:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+
 entry:
   br label %for.body