Index: llvm/lib/Target/AArch64/AArch64Subtarget.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -226,6 +226,7 @@
     PrefLoopAlignment = Align(32);
     MaxBytesForLoopAlignment = 16;
     VScaleForTuning = 2;
+    setSVETailFoldingDefaultOpts(TFSimple);
     break;
   case Neoverse512TVB:
     PrefFunctionAlignment = Align(16);
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -42,64 +42,97 @@
 namespace {
 class TailFoldingKind {
 private:
-  uint8_t Bits = 0; // Currently defaults to disabled.
+  uint8_t Bits;
 
 public:
-  enum TailFoldingOpts {
-    TFDisabled = 0x0,
-    TFReductions = 0x01,
-    TFRecurrences = 0x02,
-    TFReverse = 0x04,
-    TFSimple = 0x80,
-    TFAll = TFReductions | TFRecurrences | TFReverse | TFSimple
-  };
+  TailFoldingKind(uint8_t Bits) : Bits(Bits) {}
 
-  void operator=(const std::string &Val) {
-    if (Val.empty())
-      return;
-    SmallVector<StringRef, 4> TailFoldTypes;
-    StringRef(Val).split(TailFoldTypes, '+', -1, false);
+  void add(uint8_t Bit) { Bits |= Bit; }
+
+  void remove(uint8_t Bit) { Bits &= ~Bit; }
+
+  operator uint8_t() const { return Bits; }
+};
+
+class TailFoldingOption {
+private:
+  uint8_t DefaultBits = TFDisabled;
+  std::string OrigVal;
+  SmallVector<StringRef, 4> TailFoldTypes;
+
+  uint8_t getBits() const {
+    if (!TailFoldTypes.size())
+      return DefaultBits;
+
+    TailFoldingKind Bits(0);
     for (auto TailFoldType : TailFoldTypes) {
       if (TailFoldType == "disabled")
-        Bits = 0;
+        Bits.remove(TFAll);
       else if (TailFoldType == "all")
-        Bits = TFAll;
+        Bits.add(TFAll);
       else if (TailFoldType == "default")
-        Bits = 0; // Currently defaults to never tail-folding.
+        Bits.add(DefaultBits);
       else if (TailFoldType == "simple")
-        add(TFSimple);
+        Bits.add(TFSimple);
       else if (TailFoldType == "reductions")
-        add(TFReductions);
+        Bits.add(TFReductions);
       else if (TailFoldType == "recurrences")
-        add(TFRecurrences);
+        Bits.add(TFRecurrences);
       else if (TailFoldType == "reverse")
-        add(TFReverse);
+        Bits.add(TFReverse);
       else if (TailFoldType == "noreductions")
-        remove(TFReductions);
+        Bits.remove(TFReductions);
      else if (TailFoldType == "norecurrences")
-        remove(TFRecurrences);
+        Bits.remove(TFRecurrences);
      else if (TailFoldType == "noreverse")
-        remove(TFReverse);
-      else {
+        Bits.remove(TFReverse);
+      else
+        llvm_unreachable("No! That's impossible!");
+    }
+
+    return Bits;
+  }
+
+public:
+
+  void setDefault(uint8_t V) { DefaultBits = V; }
+
+  void operator=(const std::string &Val) {
+    if (Val.empty())
+      return;
+
+    OrigVal = Val;
+    StringRef(OrigVal).split(TailFoldTypes, '+', -1, false);
+    for (auto TailFoldType : TailFoldTypes) {
+      if (TailFoldType != "disabled" && TailFoldType != "all" &&
+          TailFoldType != "default" && TailFoldType != "simple" &&
+          TailFoldType != "reductions" && TailFoldType != "recurrences" &&
+          TailFoldType != "reverse" && TailFoldType != "noreductions" &&
+          TailFoldType != "norecurrences" && TailFoldType != "noreverse") {
         errs() << "invalid argument " << TailFoldType.str()
                << " to -sve-tail-folding=; each element must be one of: disabled, "
                   "all, default, simple, reductions, noreductions, recurrences, "
-                  "norecurrences\n";
+                  "norecurrences, reverse, noreverse\n";
       }
     }
   }
 
-  operator uint8_t() const { return Bits; }
-
-  void add(uint8_t Flag) { Bits |= Flag; }
-  void remove(uint8_t Flag) { Bits &= ~Flag; }
+  bool satisfies(TailFoldingKind Required) const {
+    return (getBits() & Required) == Required;
+  }
 };
 } // namespace
 
-TailFoldingKind TailFoldingKindLoc;
+TailFoldingOption TailFoldingOptionLoc;
+
+namespace llvm {
+void setSVETailFoldingDefaultOpts(uint8_t V) {
+  TailFoldingOptionLoc.setDefault(V);
+}
+} // namespace llvm
 
-cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
+cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
     "sve-tail-folding",
     cl::desc(
         "Control the use of vectorisation using tail-folding for SVE:"
@@ -114,7 +147,7 @@
         "recurrences"
         "\nreverse    Use tail-folding for loops requiring reversed "
         "predicates"),
-    cl::location(TailFoldingKindLoc));
+    cl::location(TailFoldingOptionLoc));
 
 // Experimental option that will only be fully functional when the
 // code-generator is changed to use SVE instead of NEON for all fixed-width
@@ -3450,7 +3483,7 @@
 }
 
 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
-  if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
+  if (!ST->hasSVE())
     return false;
 
   // We don't currently support vectorisation with interleaving for SVE - with
@@ -3459,22 +3492,22 @@
   if (TFI->IAI->hasGroups())
     return false;
 
-  TailFoldingKind Required; // Defaults to 0.
+  TailFoldingKind Required(0);
   if (TFI->LVL->getReductionVars().size())
-    Required.add(TailFoldingKind::TFReductions);
+    Required.add(TFReductions);
   if (TFI->LVL->getFixedOrderRecurrences().size())
-    Required.add(TailFoldingKind::TFRecurrences);
+    Required.add(TFRecurrences);
 
   // We call this to discover whether any load/store pointers in the loop have
   // negative strides. This will require extra work to reverse the loop
   // predicate, which may be expensive.
   if (containsDecreasingPointers(TFI->LVL->getLoop(),
                                  TFI->LVL->getPredicatedScalarEvolution()))
-    Required.add(TailFoldingKind::TFReverse);
+    Required.add(TFReverse);
   if (!Required)
-    Required.add(TailFoldingKind::TFSimple);
+    Required.add(TFSimple);
 
-  return (TailFoldingKindLoc & Required) == Required;
+  return TailFoldingOptionLoc.satisfies(Required);
 }
 
 InstructionCost
Index: llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
===================================================================
--- llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -529,6 +529,17 @@
   }
 }
 
+enum TailFoldingOpts : uint8_t {
+  TFDisabled = 0x0,
+  TFReductions = 0x01,
+  TFRecurrences = 0x02,
+  TFReverse = 0x04,
+  TFSimple = 0x80,
+  TFAll = TFReductions | TFRecurrences | TFSimple | TFReverse
+};
+
+void setSVETailFoldingDefaultOpts(uint8_t);
+
 namespace AArch64ExactFPImm {
 struct ExactFPImm {
   const char *Name;
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
@@ -1,7 +1,5 @@
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
-; RUN:   -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
-; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
-; RUN:   -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
+; RUN:   -mcpu=neoverse-v1 -sve-tail-folding=disabled < %s | FileCheck %s --check-prefix=CHECK-EPILOG
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
 ; RUN:   -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -1,11 +1,15 @@
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -S | FileCheck %s -check-prefix=CHECK-NOTF
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
-; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=disabled+simple+reductions+recurrences+reverse -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=default+disabled+simple+reductions+recurrences+reverse -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -S -mcpu=neoverse-v1 -sve-tail-folding=default+reductions+recurrences+reverse | FileCheck %s -check-prefix=CHECK-TF
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+noreverse -S | FileCheck %s -check-prefix=CHECK-TF-NOREV
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -S -sve-tail-folding=default -mcpu=neoverse-v1 | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -S -mcpu=neoverse-v1 -sve-tail-folding=default | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -58,6 +62,14 @@
 ; CHECK-TF-ONLYRED-NOT: %{{.*}} = phi <vscale x 4 x i1>
 ; CHECK-TF-ONLYRED: store <vscale x 4 x i32> %[[SPLAT]], <vscale x 4 x i32>*
 
+; CHECK-NEOVERSE-V1-LABEL: @simple_memset(
+; CHECK-NEOVERSE-V1: vector.ph:
+; CHECK-NEOVERSE-V1: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i64 0
+; CHECK-NEOVERSE-V1: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEOVERSE-V1: vector.body:
+; CHECK-NEOVERSE-V1: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]
+
 entry:
   br label %while.body
@@ -129,6 +141,15 @@
 ; CHECK-TF-ONLYRED: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
 ; CHECK-TF-ONLYRED: middle.block:
 ; CHECK-TF-ONLYRED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+
+; CHECK-NEOVERSE-V1-LABEL: @fadd_red_fast
+; CHECK-NEOVERSE-V1: vector.body:
+; CHECK-NEOVERSE-V1-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1: %[[LOAD:.*]] = load <vscale x 4 x float>
+; CHECK-NEOVERSE-V1: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-NEOVERSE-V1: middle.block:
+; CHECK-NEOVERSE-V1-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])
+
 entry:
   br label %for.body
@@ -225,6 +246,19 @@
 ; CHECK-TF-ONLYRED: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-TF-ONLYRED: store <vscale x 4 x i32> %[[ADD]]
 
+; CHECK-NEOVERSE-V1-LABEL: @add_recur
+; CHECK-NEOVERSE-V1: entry:
+; CHECK-NEOVERSE-V1: %[[PRE:.*]] = load i32, i32* %src, align 4
+; CHECK-NEOVERSE-V1: vector.ph:
+; CHECK-NEOVERSE-V1: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-NEOVERSE-V1: vector.body:
+; CHECK-NEOVERSE-V1-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-NEOVERSE-V1: %[[LOAD]] = load <vscale x 4 x i32>
+; CHECK-NEOVERSE-V1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-NEOVERSE-V1: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-NEOVERSE-V1: store <vscale x 4 x i32> %[[ADD]]
+
 entry:
   %.pre = load i32, i32* %src, align 4
   br label %for.body
@@ -276,6 +310,12 @@
 ; CHECK-TF-NOREV: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
 ; CHECK-TF-NOREV: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
 
+; CHECK-NEOVERSE-V1-LABEL: @interleave(
+; CHECK-NEOVERSE-V1: vector.body:
+; CHECK-NEOVERSE-V1: %[[LOAD:.*]] = load <8 x float>, <8 x float>
+; CHECK-NEOVERSE-V1: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+; CHECK-NEOVERSE-V1: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+
 entry:
   br label %for.body
@@ -335,6 +375,12 @@
 ; CHECK-TF-NOREC: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
 ; CHECK-TF-NOREC: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64({{.*}} %reverse
 
+; CHECK-TF-NEOVERSE-V1-LABEL: @reverse(
+; CHECK-TF-NEOVERSE-V1: vector.body:
+; CHECK-TF-NEOVERSE-V1-NOT: %{{.*}} = phi <vscale x 2 x i1>
+; CHECK-TF-NEOVERSE-V1: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* %18, align 8
+; CHECK-TF-NEOVERSE-V1: %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+
 entry:
   br label %for.body