Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -296,7 +296,18 @@
   VectorizationFactor planInVPlanNativePath(ElementCount UserVF);
 
   /// Finalize the best decision and dispose of all other VPlans.
-  void setBestPlan(ElementCount VF, unsigned UF);
+  void setBestPlan(ElementCount VF, unsigned UF,
+                   SmallVectorImpl<VPlanPtr> *BackupPlans = nullptr);
+
+  /// Take ownership of every plan in \p ExtraVPlans (for example, plans saved
+  /// by an earlier setBestPlan call) and leave \p ExtraVPlans empty.
+  void addVPlans(SmallVectorImpl<VPlanPtr> &ExtraVPlans) {
+    for (VPlanPtr &Plan : ExtraVPlans)
+      VPlans.push_back(std::move(Plan));
+
+    // Every entry is now a moved-from (null) pointer, so drop them all.
+    ExtraVPlans.clear();
+  }
 
   /// Generate the IR code for the body of the vectorized loop according to the
   /// best selected VPlan.
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6229,15 +6229,6 @@
     return Result;
   }
 
-  // FIXME: This can be fixed for scalable vectors later, because at this stage
-  // the LoopVectorizer will only consider vectorizing a loop with scalable
-  // vectors when the loop has a hint to enable vectorization for a given VF.
-  if (MainLoopVF.isScalable()) {
-    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
-                         "yet supported.\n");
-    return Result;
-  }
-
   // Not really a cost consideration, but check for unsupported cases here to
   // simplify the logic.
   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
@@ -6249,9 +6240,10 @@
 
   if (EpilogueVectorizationForceVF > 1) {
     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
-    if (LVP.hasPlanWithVFs(
-            {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
-      return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
+    ElementCount ForcedEC =
+        ElementCount::getFixed(EpilogueVectorizationForceVF);
+    if (LVP.hasPlanWithVFs({ForcedEC}))
+      return {ForcedEC, 0};
     else {
       LLVM_DEBUG(
           dbgs()
@@ -6268,14 +6260,27 @@
     return Result;
   }
 
-  if (!isEpilogueVectorizationProfitable(MainLoopVF))
+  ElementCount FixedMainLoopVF = MainLoopVF;
+  if (FixedMainLoopVF.isScalable()) {
+    FixedMainLoopVF =
+        ElementCount::getFixed(FixedMainLoopVF.getKnownMinValue());
+    LLVM_DEBUG(
+        dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
+                  "yet supported. Converting to fixed-width (VF="
+               << FixedMainLoopVF << ") instead\n");
+  }
+
+  if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
+    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
+                         "this loop\n");
     return Result;
+  }
 
   for (auto &NextVF : ProfitableVFs)
-    if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
+    if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
         (Result.Width.getFixedValue() == 1 ||
          isMoreProfitable(NextVF, Result)) &&
-        LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
+        LVP.hasPlanWithVFs({NextVF.Width}))
       Result = NextVF;
 
   if (Result != VectorizationFactor::Disabled())
@@ -8182,15 +8187,24 @@
   return SelectedVF;
 }
 
-void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
+void LoopVectorizationPlanner::setBestPlan(
+    ElementCount VF, unsigned UF, SmallVectorImpl<VPlanPtr> *BackupPlans) {
   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
                     << '\n');
   BestVF = VF;
   BestUF = UF;
 
+  // Hand plans without VF to the caller; erase_if drops the nulled slots.
+  if (BackupPlans) {
+    for (VPlanPtr &Plan : VPlans)
+      if (!Plan->hasVF(VF))
+        BackupPlans->push_back(std::move(Plan));
+  }
+
   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
-    return !Plan->hasVF(VF);
+    return !Plan || !Plan->hasVF(VF);
   });
+
   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
 }
 
@@ -8388,7 +8402,9 @@
   OldInduction = Legal->getPrimaryInduction();
   Type *IdxTy = Legal->getWidestInductionType();
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
-  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+
+  IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt());
+  Value *Step = getRuntimeVF(B, IdxTy, VF * UF);
   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
   EPI.VectorTripCount = CountRoundDown;
   Induction =
@@ -10380,7 +10396,6 @@
                              F->getParent()->getDataLayout());
     if (!VF.Width.isScalar() || IC > 1)
       Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
-    LVP.setBestPlan(VF.Width, IC);
 
     using namespace ore;
     if (!VectorizeLoop) {
@@ -10389,6 +10404,7 @@
       // interleave it.
       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                  &CM, BFI, PSI, Checks);
+      LVP.setBestPlan(VF.Width, IC);
       LVP.executePlan(Unroller, DT);
 
       ORE->emit([&]() {
@@ -10411,8 +10427,8 @@
         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                            EPI, &LVL, &CM, BFI, PSI, Checks);
-
-        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
+        SmallVector<VPlanPtr, 4> ExtraVPlans;
+        LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF, &ExtraVPlans);
         LVP.executePlan(MainILV, DT);
         ++LoopsVectorized;
 
@@ -10421,6 +10437,7 @@
 
         // Second pass vectorizes the epilogue and adjusts the control flow
         // edges from the first pass.
+        LVP.addVPlans(ExtraVPlans);
         LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
         EPI.MainLoopVF = EPI.EpilogueVF;
         EPI.MainLoopUF = EPI.EpilogueUF;
@@ -10435,6 +10452,7 @@
       } else {
         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                                &LVL, &CM, BFI, PSI, Checks);
+        LVP.setBestPlan(VF.Width, IC);
         LVP.executePlan(LB, DT);
         ++LoopsVectorized;
 
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -0,0 +1,59 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=0 --debug-only=loop-vectorize -S -scalable-vectorization=preferred 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=8 --debug-only=loop-vectorize -S -scalable-vectorization=preferred 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-force-VF=8 --debug-only=loop-vectorize -S -scalable-vectorization=preferred 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=DEBUG-FORCED
+
+target triple = "aarch64-linux-gnu"
+
+; DEBUG: LV: Checking a loop in "f1"
+; DEBUG: LEV: Epilogue vectorization using scalable vectors not yet supported. Converting to fixed-width (VF=16) instead
+; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
+; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
+
+; DEBUG-FORCED: LV: Checking a loop in "f1"
+; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced.
+; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass)
+; DEBUG-FORCED: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
+
+define void @f1(i8* %A) #0 {
+; CHECK-LABEL: @f1(
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[NUM_ITS:%.*]] = mul i64 [[VSCALE]], 32
+; CHECK-NEXT:    [[MIN_IT_CHECK:%.*]] = icmp ult i64 1024, [[NUM_ITS]]
+; CHECK-NEXT:    br i1 [[MIN_IT_CHECK]], label %vec.epilog.ph, label %vector.ph
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[VSCALE1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VEC_ITS1:%.*]] = mul i64 [[VSCALE1]], {{.*}}
+; CHECK-NEXT:    [[VSCALE2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VEC_ITS2:%.*]] = mul i64 [[VSCALE2]], {{.*}}
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[VEC_ITS2]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NXT:%.*]], %vector.body ]
+; CHECK:         store <vscale x 16 x i8>
+; CHECK:         store <vscale x 16 x i8>
+; CHECK:         [[IDX_NXT]] = add nuw i64 [[IDX]], [[VEC_ITS1]]
+; CHECK-NEXT:    {{%.*}} = icmp eq i64 [[IDX_NXT]], [[N_VEC]]
+; CHECK:       vec.epilog.vector.body:
+; CHECK:         store <8 x i8>
+; CHECK:       for.body:
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %iv
+  store i8 1, i8* %arrayidx, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, 1024
+  br i1 %exitcond, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
Index: llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
+++ llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll
@@ -1,11 +1,12 @@
 ; REQUIRES: asserts
-; RUN: opt < %s  -passes='loop-vectorize' -force-vector-width=2 -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S -scalable-vectorization=on 2>&1 | FileCheck %s
+; RUN: opt < %s  -passes='loop-vectorize' -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S -scalable-vectorization=on 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
 
 ; Currently we cannot handle scalable vectorization factors.
 ; CHECK: LV: Checking a loop in "f1"
-; CHECK: LEV: Epilogue vectorization for scalable vectors not yet supported.
+; CHECK: LEV: Epilogue vectorization factor is forced.
+; CHECK: Epilogue Loop VF:2, Epilogue Loop UF:1
 
 define void @f1(i8* %A) {
 entry: