Index: llvm/include/llvm/Analysis/VectorUtils.h
===================================================================
--- llvm/include/llvm/Analysis/VectorUtils.h
+++ llvm/include/llvm/Analysis/VectorUtils.h
@@ -811,6 +811,8 @@
   /// cannot be filtered by masking the load/store.
   void invalidateGroupsRequiringScalarEpilogue();
 
+  bool hasGroups() const { return !InterleaveGroups.empty(); }
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5063,9 +5063,23 @@
   // Invalidate interleave groups that require an epilogue if we can't mask
   // the interleave-group.
-  if (!useMaskedInterleavedAccesses(TTI)) {
+  if (InterleaveInfo.hasGroups() && !useMaskedInterleavedAccesses(TTI)) {
     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
            "No decisions should have been taken at this point");
+
+    // If the target doesn't have masked interleaved accesses, then it's very
+    // likely the costs will be far too high to consider vectorising, e.g. see
+    // where useEmulatedMaskMemRefHack is used. If we're permitted to fall back
+    // on an unpredicated vector loop + scalar epilogue then let's do it now.
+    if (UserVF.isZero() &&
+        ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+      LLVM_DEBUG(dbgs() << "LV: Not folding tail by masking due to "
+                           "interleaving: vectorize with a scalar epilogue "
+                           "instead.\n");
+      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+      return computeFeasibleMaxVF(TC, UserVF, false);
+    }
+
     // Note: There is no need to invalidate any cost modeling decisions here, as
     // non where taken so far.
     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
   }
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-interleave.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-interleave.ll
@@ -0,0 +1,68 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN:   -debug < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck --check-prefix=DEBUG %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; DEBUG: Not folding tail by masking due to interleaving: vectorize with a scalar epilogue instead.
+
+define void @foo(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 -2
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[STRIDED_VEC]], <4 x float> [[STRIDED_VEC1]], <8 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> , <12 x i32>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x float> [[TMP9]], <12 x float> poison, <12 x i32>
+; CHECK-NEXT:    store <12 x float> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.021 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %mul = shl nuw nsw i64 %i.021, 1
+  %arrayidx = getelementptr inbounds float, ptr %src, i64 %mul
+  %0 = load float, ptr %arrayidx, align 4
+  %mul1 = mul nuw nsw i64 %i.021, 3
+  %arrayidx2 = getelementptr inbounds float, ptr %dst, i64 %mul1
+  store float %0, ptr %arrayidx2, align 4
+  %add = or i64 %mul, 1
+  %arrayidx4 = getelementptr inbounds float, ptr %src, i64 %add
+  %1 = load float, ptr %arrayidx4, align 4
+  %add6 = add nuw nsw i64 %mul1, 1
+  %arrayidx7 = getelementptr inbounds float, ptr %dst, i64 %add6
+  store float %1, ptr %arrayidx7, align 4
+  %add9 = add nuw nsw i64 %mul1, 2
+  %arrayidx10 = getelementptr inbounds float, ptr %dst, i64 %add9
+  store float 3.000000e+00, ptr %arrayidx10, align 4
+  %inc = add nuw nsw i64 %i.021, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
Index: llvm/test/Transforms/LoopVectorize/tail-folding-interleave.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/tail-folding-interleave.ll
@@ -0,0 +1,34 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN:   -enable-interleaved-mem-accesses -debug -disable-output < %s 2>&1 | FileCheck %s
+
+; CHECK: Not folding tail by masking due to interleaving: vectorize with a scalar epilogue instead.
+
+define void @foo(ptr noalias %dst, ptr noalias %src, i64 noundef %n) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.021 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %mul = shl nuw nsw i64 %i.021, 1
+  %arrayidx = getelementptr inbounds float, ptr %src, i64 %mul
+  %0 = load float, ptr %arrayidx, align 4
+  %mul1 = mul nuw nsw i64 %i.021, 3
+  %arrayidx2 = getelementptr inbounds float, ptr %dst, i64 %mul1
+  store float %0, ptr %arrayidx2, align 4
+  %add = or i64 %mul, 1
+  %arrayidx4 = getelementptr inbounds float, ptr %src, i64 %add
+  %1 = load float, ptr %arrayidx4, align 4
+  %add6 = add nuw nsw i64 %mul1, 1
+  %arrayidx7 = getelementptr inbounds float, ptr %dst, i64 %add6
+  store float %1, ptr %arrayidx7, align 4
+  %add9 = add nuw nsw i64 %mul1, 2
+  %arrayidx10 = getelementptr inbounds float, ptr %dst, i64 %add9
+  store float 3.000000e+00, ptr %arrayidx10, align 4
+  %inc = add nuw nsw i64 %i.021, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
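
For reference, a minimal standalone C++ sketch (not part of the patch) of the decision the new early-return in computeMaxVF implements. The struct and field names below are illustrative stand-ins for InterleaveInfo.hasGroups(), useMaskedInterleavedAccesses(TTI), UserVF and ScalarEpilogueStatus, not the actual LLVM API.

    // Illustrative sketch only: the shape of the new tail-folding fallback.
    #include <cstdio>

    enum ScalarEpilogueLowering {
      CM_ScalarEpilogueAllowed,
      CM_ScalarEpilogueNotNeededUsePredicate
    };

    struct TailFoldingSketch {
      bool HasInterleaveGroups;    // stand-in for InterleaveInfo.hasGroups()
      bool MaskedInterleaveLegal;  // stand-in for useMaskedInterleavedAccesses(TTI)
      bool UserForcedVF;           // stand-in for !UserVF.isZero()
      ScalarEpilogueLowering Status;

      // True when tail folding should be abandoned in favour of an
      // unpredicated vector loop plus a scalar epilogue: there are interleave
      // groups, the target cannot mask them, the user did not force a VF, and
      // the cost model was only predicating because it thought no epilogue
      // would be needed.
      bool fallBackToScalarEpilogue() const {
        return HasInterleaveGroups && !MaskedInterleaveLegal && !UserForcedVF &&
               Status == CM_ScalarEpilogueNotNeededUsePredicate;
      }
    };

    int main() {
      TailFoldingSketch S{/*HasInterleaveGroups=*/true,
                          /*MaskedInterleaveLegal=*/false,
                          /*UserForcedVF=*/false,
                          CM_ScalarEpilogueNotNeededUsePredicate};
      // Prints 1: matches the case exercised by the two new tests above.
      std::printf("fall back to scalar epilogue: %d\n",
                  S.fallBackToScalarEpilogue());
      return 0;
    }

When the predicate returns true, the patch flips the status to CM_ScalarEpilogueAllowed and recomputes the feasible VF, so the loop is vectorized without masking and the remainder iterations run in a scalar epilogue.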