Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -581,13 +581,28 @@
                                 AssumptionCache &AC, TargetLibraryInfo *LibInfo,
                                 HardwareLoopInfo &HWLoopInfo) const;
 
+  /// The Preferred Vectorization Predication scheme of the target for the given
+  /// loop.
+  /// - ScalarEpilogue: Don't attempt to predicate the vector body (or epilog)
+  /// - PredicateElseEpilogue: Attempt to predicate the body else fall back to
+  ///   unpredicated.
+  /// - PredicateOrDontVectorize: Attempt to predicate the body else don't
+  ///   vectorize.
+  /// - UsePredicatedEpilogue: Use an unpredicated vector body with a predicated
+  ///   remainder.
+  enum class PreferPredicateTy {
+    ScalarEpilogue,
+    PredicateElseEpilogue,
+    PredicateOrDontVectorize,
+    UsePredicatedEpilogue,
+  };
+
   /// Query the target whether it would be prefered to create a predicated
   /// vector loop, which can avoid the need to emit a scalar epilogue loop.
-  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
-                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
-                                   DominatorTree *DT,
-                                   LoopVectorizationLegality *LVL,
-                                   InterleavedAccessInfo *IAI) const;
+  PreferPredicateTy getPreferredVectorPredication(
+      Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+      TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
+      InterleavedAccessInfo *IAI) const;
 
   /// Query the target what the preferred style of tail folding is.
   /// \param IVUpdateMayOverflow Tells whether it is known if the IV update
@@ -1700,15 +1715,14 @@
                                         AssumptionCache &AC,
                                         TargetLibraryInfo *LibInfo,
                                         HardwareLoopInfo &HWLoopInfo) = 0;
-  virtual bool
-  preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
-                              AssumptionCache &AC, TargetLibraryInfo *TLI,
-                              DominatorTree *DT, LoopVectorizationLegality *LVL,
-                              InterleavedAccessInfo *IAI) = 0;
+  virtual PreferPredicateTy getPreferredVectorPredication(
+      Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+      TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
+      InterleavedAccessInfo *IAI) = 0;
   virtual TailFoldingStyle
   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) = 0;
-  virtual std::optional<Instruction *> instCombineIntrinsic(
-      InstCombiner &IC, IntrinsicInst &II) = 0;
+  virtual std::optional<Instruction *>
+  instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) = 0;
   virtual std::optional<Value *> simplifyDemandedUseBitsIntrinsic(
       InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask,
       KnownBits & Known, bool &KnownBitsComputed) = 0;
@@ -2106,12 +2120,11 @@
                                 HardwareLoopInfo &HWLoopInfo) override {
     return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
   }
-  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
-                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
-                                   DominatorTree *DT,
-                                   LoopVectorizationLegality *LVL,
-                                   InterleavedAccessInfo *IAI) override {
-    return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI);
+  TTI::PreferPredicateTy getPreferredVectorPredication(
+      Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+      TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
+      InterleavedAccessInfo *IAI) override {
+    return Impl.getPreferredVectorPredication(L, LI, SE, AC, TLI, DT, LVL, IAI);
   }
   TailFoldingStyle
   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) override {
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -163,12 +163,11 @@
     return false;
   }
 
-  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
-                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
-                                   DominatorTree *DT,
-                                   LoopVectorizationLegality *LVL,
-                                   InterleavedAccessInfo *IAI) const {
-    return false;
+  TTI::PreferPredicateTy getPreferredVectorPredication(
+      Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+      TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
+      InterleavedAccessInfo *IAI) const {
+    return TTI::PreferPredicateTy::ScalarEpilogue;
   }
 
   TailFoldingStyle
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -622,12 +622,12 @@
     return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
   }
 
-  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
-                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
-                                   DominatorTree *DT,
-                                   LoopVectorizationLegality *LVL,
-                                   InterleavedAccessInfo *IAI) {
-    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI);
+  TTI::PreferPredicateTy getPreferredVectorPredication(
+      Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+      TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
+      InterleavedAccessInfo *IAI) {
+    return BaseT::getPreferredVectorPredication(L, LI, SE, AC, TLI, DT, LVL,
+                                                IAI);
   }
 
   TailFoldingStyle
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -313,11 +313,12 @@
   return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
 }
 
-bool TargetTransformInfo::preferPredicateOverEpilogue(
+TTI::PreferPredicateTy TargetTransformInfo::getPreferredVectorPredication(
     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
     TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
     InterleavedAccessInfo *IAI) const {
-  return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI);
+  return TTIImpl->getPreferredVectorPredication(L, LI, SE, AC, TLI, DT, LVL,
+                                                IAI);
 }
 
 TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle(
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -356,11 +356,10 @@
     return TailFoldingStyle::DataWithoutLaneMask;
   }
 
-  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
-                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
-                                   DominatorTree *DT,
-                                   LoopVectorizationLegality *LVL,
-                                   InterleavedAccessInfo *IAI);
+  TTI::PreferPredicateTy getPreferredVectorPredication(
+      Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+      TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
+      InterleavedAccessInfo *IAI);
 
   bool supportsScalableVectors() const { return ST->hasSVE(); }
 
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3390,18 +3390,18 @@
   return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
 }
 
-bool AArch64TTIImpl::preferPredicateOverEpilogue(
+TTI::PreferPredicateTy AArch64TTIImpl::getPreferredVectorPredication(
     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
     TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
     InterleavedAccessInfo *IAI) {
   if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
-    return false;
+    return TTI::PreferPredicateTy::ScalarEpilogue;
 
   // We don't currently support vectorisation with interleaving for SVE - with
   // such loops we're better off not using tail-folding. This gives us a chance
   // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
   if (IAI->hasGroups())
-    return false;
+    return TTI::PreferPredicateTy::ScalarEpilogue;
 
   TailFoldingKind Required; // Defaults to 0.
   if (LVL->getReductionVars().size())
@@ -3411,7 +3411,9 @@
   if (!Required)
     Required.add(TailFoldingKind::TFSimple);
 
-  return (TailFoldingKindLoc & Required) == Required;
+  return (TailFoldingKindLoc & Required) == Required
+             ? TTI::PreferPredicateTy::PredicateElseEpilogue
+             : TTI::PreferPredicateTy::ScalarEpilogue;
 }
 
 InstructionCost
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -300,14 +300,12 @@
   bool maybeLoweredToCall(Instruction &I);
   bool isLoweredToCall(const Function *F);
   bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
-                                AssumptionCache &AC,
-                                TargetLibraryInfo *LibInfo,
+                                AssumptionCache &AC, TargetLibraryInfo *LibInfo,
                                 HardwareLoopInfo &HWLoopInfo);
-  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
-                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
-                                   DominatorTree *DT,
-                                   LoopVectorizationLegality *LVL,
-                                   InterleavedAccessInfo *IAI);
+  TTI::PreferPredicateTy getPreferredVectorPredication(
+      Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+      TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
+      InterleavedAccessInfo *IAI);
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP,
                                OptimizationRemarkEmitter *ORE);
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2238,52 +2238,55 @@
   return true;
 }
 
-bool ARMTTIImpl::preferPredicateOverEpilogue(
+TTI::PreferPredicateTy ARMTTIImpl::getPreferredVectorPredication(
     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
     TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
     InterleavedAccessInfo *IAI) {
   if (!EnableTailPredication) {
     LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
-    return false;
+    return TTI::PreferPredicateTy::ScalarEpilogue;
   }
 
   // Creating a predicated vector loop is the first step for generating a
   // tail-predicated hardware loop, for which we need the MVE masked
   // load/stores instructions:
   if (!ST->hasMVEIntegerOps())
-    return false;
+    return TTI::PreferPredicateTy::ScalarEpilogue;
 
   // For now, restrict this to single block loops.
   if (L->getNumBlocks() > 1) {
-    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
+    LLVM_DEBUG(dbgs() << "getPreferredVectorPredication: not a single block "
                          "loop.\n");
-    return false;
+    return TTI::PreferPredicateTy::ScalarEpilogue;
   }
 
-  assert(L->isInnermost() && "preferPredicateOverEpilogue: inner-loop expected");
+  assert(L->isInnermost() &&
+         "getPreferredVectorPredication: inner-loop expected");
 
   HardwareLoopInfo HWLoopInfo(L);
   if (!HWLoopInfo.canAnalyze(*LI)) {
-    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+    LLVM_DEBUG(dbgs() << "getPreferredVectorPredication: hardware-loop is not "
                          "analyzable.\n");
-    return false;
+    return TTI::PreferPredicateTy::ScalarEpilogue;
   }
 
   // This checks if we have the low-overhead branch architecture
   // extension, and if we will create a hardware-loop:
   if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
-    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+    LLVM_DEBUG(dbgs() << "getPreferredVectorPredication: hardware-loop is not "
                          "profitable.\n");
-    return false;
+    return TTI::PreferPredicateTy::ScalarEpilogue;
   }
 
   if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
-    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+    LLVM_DEBUG(dbgs() << "getPreferredVectorPredication: hardware-loop is not "
                          "a candidate.\n");
-    return false;
+    return TTI::PreferPredicateTy::ScalarEpilogue;
   }
 
-  return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI());
+  if (!canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI()))
+    return TTI::PreferPredicateTy::ScalarEpilogue;
+  return TTI::PreferPredicateTy::PredicateElseEpilogue;
 }
 
 TailFoldingStyle
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -201,36 +201,32 @@
     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
     cl::desc("The maximum allowed number of runtime memory checks"));
 
-// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
-// that predication is preferred, and this lists all options. I.e., the
-// vectorizer will try to fold the tail-loop (epilogue) into the vector body
+// Option prefer-predicate-over-epilogue indicates that an epilogue is
+// undesired, that predication is preferred, and this lists all options. I.e.,
+// the vectorizer will try to fold the tail-loop (epilogue) into the vector body
 // and predicate the instructions accordingly. If tail-folding fails, there are
 // different fallback strategies depending on these values:
-namespace PreferPredicateTy {
-  enum Option {
-    ScalarEpilogue = 0,
-    PredicateElseScalarEpilogue,
-    PredicateOrDontVectorize
-  };
-} // namespace PreferPredicateTy
 
-static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
+static cl::opt<TTI::PreferPredicateTy> PreferPredicateOverEpilogue(
     "prefer-predicate-over-epilogue",
-    cl::init(PreferPredicateTy::ScalarEpilogue),
-    cl::Hidden,
+    cl::init(TTI::PreferPredicateTy::ScalarEpilogue), cl::Hidden,
     cl::desc("Tail-folding and predication preferences over creating a scalar "
              "epilogue loop."),
-    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
-                         "scalar-epilogue",
-                         "Don't tail-predicate loops, create scalar epilogue"),
-              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
-                         "predicate-else-scalar-epilogue",
-                         "prefer tail-folding, create scalar epilogue if tail "
-                         "folding fails."),
-              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
-                         "predicate-dont-vectorize",
-                         "prefers tail-folding, don't attempt vectorization if "
-                         "tail-folding fails.")));
+    cl::values(
+        clEnumValN(TTI::PreferPredicateTy::ScalarEpilogue, "scalar-epilogue",
+                   "Don't tail-predicate loops, create scalar epilogue"),
+        clEnumValN(
+            TTI::PreferPredicateTy::PredicateElseEpilogue,
+            "predicate-else-scalar-epilogue",
+            "prefer tail-folding, create epilogue if tail folding fails."),
+        clEnumValN(TTI::PreferPredicateTy::PredicateOrDontVectorize,
+                   "predicate-dont-vectorize",
+                   "prefers tail-folding, don't attempt vectorization if "
+                   "tail-folding fails."),
+        clEnumValN(TTI::PreferPredicateTy::UsePredicatedEpilogue,
+                   "use-predicated-epilogue",
+                   "Don't tail-predicate the loops but allow a predicated "
+                   "epilogue.")));
 
 static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
     "force-tail-folding-style", cl::desc("Force the tail folding style"),
@@ -1141,27 +1137,32 @@
 
 namespace llvm {
 
-// Loop vectorization cost-model hints how the scalar epilogue loop should be
+// Loop vectorization cost-model hints how the loop and epilogues should be
 // lowered.
 enum ScalarEpilogueLowering {
-
-  // The default: allowing scalar epilogues.
+  // The default: allowing epilogues but don't attempt to predicate the vector
+  // body (FoldTailByMasking) or the remainder.
   CM_ScalarEpilogueAllowed,
 
   // Vectorization with OptForSize: don't allow epilogues.
   CM_ScalarEpilogueNotAllowedOptSize,
 
-  // A special case of vectorisation with OptForSize: loops with a very small
-  // trip count are considered for vectorization under OptForSize, thereby
-  // making sure the cost of their loop body is dominant, free of runtime
-  // guards and scalar iteration overheads.
+  // The same as CM_ScalarEpilogueNotAllowedOptSize but for a different reason
+  // (the trip count is known to be low) and the reported remarks are different.
   CM_ScalarEpilogueNotAllowedLowTripLoop,
 
-  // Loop hint predicate indicating an epilogue is undesired.
+  // Loop hint 'predicate' indicating predicating the entire loop might be
+  // profitable. Picks between predicated and unpredicated plans based on which
+  // is lower cost. The remainder can be predicated or scalar if required.
   CM_ScalarEpilogueNotNeededUsePredicate,
 
   // Directive indicating we must either tail fold or not vectorize
-  CM_ScalarEpilogueNotAllowedUsePredicate
+  CM_ScalarEpilogueNotAllowedUsePredicate,
+
+  // Use an unpredicated main vector loop, as with CM_ScalarEpilogueAllowed,
+  // but additionally allow the remainder (epilogue) loop to be predicated
+  // if that is profitable.
+  CM_ScalarEpilogueAllowedUsePredRemainder,
 };
 
 /// ElementCountComparator creates a total ordering for ElementCount
@@ -1555,7 +1556,8 @@
   bool isScalarEpilogueAllowed() const {
     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
            (!FoldTailByMasking &&
-            ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate);
+            (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
+             ScalarEpilogueStatus == CM_ScalarEpilogueAllowedUsePredRemainder));
   }
 
   /// Returns the TailFoldingStyle that is best for the current loop.
@@ -1580,6 +1582,13 @@
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
+  /// Return true when a predicated main loop is allowed: tail folding is on and
+  /// ScalarEpilogueStatus is not CM_ScalarEpilogueAllowedUsePredRemainder.
+  bool allowPredicatedVectorBody() const {
+    return FoldTailByMasking &&
+           ScalarEpilogueStatus != CM_ScalarEpilogueAllowedUsePredRemainder;
+  }
+
   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
   /// nodes to the chain of instructions representing the reductions. Uses a
   /// MapVector to ensure deterministic iteration order.
@@ -5085,6 +5094,13 @@
     LLVM_DEBUG(dbgs() << "LV: vector predicate hint/switch found.\n"
                       << "LV: Trying predicated vector loop.\n");
     break;
+  case CM_ScalarEpilogueAllowedUsePredRemainder:
+    // If this cost model is for unpredicated plans then generate them so that
+    // the loop body can be unpredicated and the remainder predicated.
+    if (!FoldTailByMasking)
+      return computeFeasibleMaxVF(TC, UserVF, false);
+    LLVM_DEBUG(dbgs() << "LV: Allowing predication for remainder loops.\n");
+    break;
   case CM_ScalarEpilogueNotAllowedLowTripLoop:
     // fallthrough as a special case of OptForSize
   case CM_ScalarEpilogueNotAllowedOptSize:
@@ -5523,7 +5539,9 @@
       if (isMoreProfitable(Candidate, ScalarCost))
         ProfitableVFs.push_back(Candidate);
 
-      if (isMoreProfitable(Candidate, ChosenFactor))
+      if ((!Candidate.FoldTailByMasking ||
+           VPlan->getCostModel()->allowPredicatedVectorBody()) &&
+          isMoreProfitable(Candidate, ChosenFactor))
         ChosenFactor = Candidate;
     }
   }
@@ -5624,6 +5642,8 @@
     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
     if (hasPlanWithVF(ForcedEC, false))
       return {ForcedEC, false, 0, 0};
+    else if (hasPlanWithVF(ForcedEC, true))
+      return {ForcedEC, true, 0, 0};
     else {
       LLVM_DEBUG(
           dbgs()
@@ -9787,12 +9807,14 @@
   // 2) If set, obey the directives
   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
     switch (PreferPredicateOverEpilogue) {
-    case PreferPredicateTy::ScalarEpilogue:
+    case TTI::PreferPredicateTy::ScalarEpilogue:
       return CM_ScalarEpilogueAllowed;
-    case PreferPredicateTy::PredicateElseScalarEpilogue:
+    case TTI::PreferPredicateTy::PredicateElseEpilogue:
       return CM_ScalarEpilogueNotNeededUsePredicate;
-    case PreferPredicateTy::PredicateOrDontVectorize:
+    case TTI::PreferPredicateTy::PredicateOrDontVectorize:
       return CM_ScalarEpilogueNotAllowedUsePredicate;
+    case TTI::PreferPredicateTy::UsePredicatedEpilogue:
+      return CM_ScalarEpilogueAllowedUsePredRemainder;
     };
   }
 
@@ -9804,11 +9826,29 @@
     return CM_ScalarEpilogueAllowed;
   };
 
-  // 4) if the TTI hook indicates this is profitable, request predication.
-  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
+  // 4) Query the TTI hook for the target's preferred option, which defaults to
+  // ScalarEpilogue (mapped to CM_ScalarEpilogueAllowed).
+  TTI::PreferPredicateTy TargetPreferredPredication =
+      TTI->getPreferredVectorPredication(L, LI, *SE, *AC, TLI, DT, &LVL, IAI);
+  switch (TargetPreferredPredication) {
+  case TTI::PreferPredicateTy::ScalarEpilogue:
+    LLVM_DEBUG(dbgs() << "LV: Target has picked ScalarEpilogue Predication\n");
+    return CM_ScalarEpilogueAllowed;
+  case TTI::PreferPredicateTy::PredicateElseEpilogue:
+    LLVM_DEBUG(
+        dbgs() << "LV: Target has picked PredicateElseEpilogue Predication\n");
     return CM_ScalarEpilogueNotNeededUsePredicate;
-
-  return CM_ScalarEpilogueAllowed;
+  case TTI::PreferPredicateTy::PredicateOrDontVectorize:
+    LLVM_DEBUG(dbgs() << "LV: Target has picked PredicateOrDontVectorize "
+                         "Predication\n");
+    return CM_ScalarEpilogueNotAllowedUsePredicate;
+  case TTI::PreferPredicateTy::UsePredicatedEpilogue:
+    LLVM_DEBUG(
+        dbgs() << "LV: Target has picked UsePredicatedEpilogue Predication\n");
+    return CM_ScalarEpilogueAllowedUsePredRemainder;
+  }
+  // Every enumerator returns above; no 'default' keeps -Wswitch coverage.
+  llvm_unreachable("Unexpected PreferPredicateTy returned from the target!");
 }
 
 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
Index: llvm/test/Transforms/LoopVectorize/ARM/epilog-predicated.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/ARM/epilog-predicated.ll
@@ -0,0 +1,672 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize,simplifycfg -S %s | FileCheck %s --check-prefix=DEFAULT
+; RUN: opt -passes=loop-vectorize,simplifycfg -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s | FileCheck %s --check-prefix=PREDBODY
+; RUN: opt -passes=loop-vectorize,simplifycfg -prefer-predicate-over-epilogue=use-predicated-epilogue -S %s | FileCheck %s --check-prefix=PREDEPI
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-none-eabi"
+
+; By default we should vectorize with a predicated body for these loops.
+
+define noundef i32 @add(ptr nocapture noundef readonly %x, ptr noalias nocapture noundef writeonly %y, i32 noundef %n) #0 {
+; DEFAULT-LABEL: @add(
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DEFAULT-NEXT:    br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; DEFAULT-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
+; DEFAULT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; DEFAULT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; DEFAULT-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; DEFAULT-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; DEFAULT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[TMP0]]
+; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; DEFAULT-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP3]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; DEFAULT-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; DEFAULT:       for.cond.cleanup:
+; DEFAULT-NEXT:    ret i32 0
+;
+; PREDBODY-LABEL: @add(
+; PREDBODY-NEXT:  entry:
+; PREDBODY-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; PREDBODY-NEXT:    br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; PREDBODY:       vector.ph:
+; PREDBODY-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; PREDBODY-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
+; PREDBODY-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; PREDBODY-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PREDBODY:       vector.body:
+; PREDBODY-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDBODY-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; PREDBODY-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
+; PREDBODY-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; PREDBODY-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; PREDBODY-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDBODY-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; PREDBODY-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[TMP0]]
+; PREDBODY-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; PREDBODY-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP3]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; PREDBODY-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; PREDBODY-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDBODY-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDBODY:       for.cond.cleanup:
+; PREDBODY-NEXT:    ret i32 0
+;
+; PREDEPI-LABEL: @add(
+; PREDEPI-NEXT:  entry:
+; PREDEPI-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; PREDEPI-NEXT:    br i1 [[CMP6]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; PREDEPI:       vector.main.loop.iter.check:
+; PREDEPI-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; PREDEPI-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PREDEPI:       vector.ph:
+; PREDEPI-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; PREDEPI-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; PREDEPI-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PREDEPI:       vector.body:
+; PREDEPI-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDEPI-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; PREDEPI-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; PREDEPI-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; PREDEPI-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; PREDEPI-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; PREDEPI-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[TMP0]]
+; PREDEPI-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; PREDEPI-NEXT:    store <4 x i32> [[TMP3]], ptr [[TMP5]], align 4
+; PREDEPI-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; PREDEPI-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDEPI-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDEPI:       middle.block:
+; PREDEPI-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; PREDEPI-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_PH]]
+; PREDEPI:       vec.epilog.ph:
+; PREDEPI-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; PREDEPI-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; PREDEPI-NEXT:    [[N_MOD_VF1:%.*]] = urem i32 [[N_RND_UP]], 4
+; PREDEPI-NEXT:    [[N_VEC2:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF1]]
+; PREDEPI-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; PREDEPI:       vec.epilog.vector.body:
+; PREDEPI-NEXT:    [[INDEX3:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDEPI-NEXT:    [[TMP7:%.*]] = add i32 [[INDEX3]], 0
+; PREDEPI-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP7]], i32 [[N]])
+; PREDEPI-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP7]]
+; PREDEPI-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
+; PREDEPI-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDEPI-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; PREDEPI-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[Y]], i32 [[TMP7]]
+; PREDEPI-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
+; PREDEPI-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP10]], ptr [[TMP12]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; PREDEPI-NEXT:    [[INDEX_NEXT4]] = add i32 [[INDEX3]], 4
+; PREDEPI-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT4]], [[N_VEC2]]
+; PREDEPI-NEXT:    br i1 [[TMP13]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; PREDEPI:       for.cond.cleanup:
+; PREDEPI-NEXT:    ret i32 0
+;
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %add = add nsw i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, ptr %y, i32 %indvars.iv
+  store i32 %add, ptr %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond.not = icmp eq i32 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret i32 0
+}
+
+define noundef i32 @interleave(ptr nocapture noundef readonly %x, ptr noalias nocapture noundef writeonly %y, i32 noundef %n) #0 {
+; DEFAULT-LABEL: @interleave(
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DEFAULT-NEXT:    br i1 [[CMP12]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DEFAULT:       for.body.preheader:
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; DEFAULT-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[TMP0]], 1
+; DEFAULT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP1]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; DEFAULT-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
+; DEFAULT-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; DEFAULT-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; DEFAULT-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[TMP0]]
+; DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; DEFAULT-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; DEFAULT-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
+; DEFAULT:       for.body:
+; DEFAULT-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP8:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1
+; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP8]]
+; DEFAULT-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DEFAULT-NEXT:    [[TMP10:%.*]] = or i32 [[TMP8]], 1
+; DEFAULT-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP10]]
+; DEFAULT-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; DEFAULT-NEXT:    [[MUL4:%.*]] = mul nsw i32 [[TMP11]], [[TMP9]]
+; DEFAULT-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[Y]], i32 [[INDVARS_IV]]
+; DEFAULT-NEXT:    store i32 [[MUL4]], ptr [[ARRAYIDX6]], align 4
+; DEFAULT-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
+; DEFAULT-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]]
+; DEFAULT-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; DEFAULT:       for.cond.cleanup:
+; DEFAULT-NEXT:    ret i32 0
+; The next group of checks expects the same unpredicated main vector loop, followed by a lane-masked vector epilogue (two masked gathers and a masked store) in place of the scalar remainder loop.
+; PREDBODY-LABEL: @interleave(
+; PREDBODY-NEXT:  entry:
+; PREDBODY-NEXT:    [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; PREDBODY-NEXT:    br i1 [[CMP12]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; PREDBODY:       vector.main.loop.iter.check:
+; PREDBODY-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; PREDBODY-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PREDBODY:       vector.ph:
+; PREDBODY-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; PREDBODY-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; PREDBODY-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PREDBODY:       vector.body:
+; PREDBODY-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDBODY-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; PREDBODY-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[TMP0]], 1
+; PREDBODY-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP1]]
+; PREDBODY-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; PREDBODY-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
+; PREDBODY-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; PREDBODY-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; PREDBODY-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; PREDBODY-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[TMP0]]
+; PREDBODY-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; PREDBODY-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4
+; PREDBODY-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; PREDBODY-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDBODY-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; PREDBODY:       middle.block:
+; PREDBODY-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; PREDBODY-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_PH]]
+; PREDBODY:       vec.epilog.ph:
+; PREDBODY-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; PREDBODY-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; PREDBODY-NEXT:    [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 4
+; PREDBODY-NEXT:    [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]]
+; PREDBODY-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
+; PREDBODY-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PREDBODY-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; PREDBODY-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; PREDBODY:       vec.epilog.vector.body:
+; PREDBODY-NEXT:    [[INDEX5:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDBODY-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDBODY-NEXT:    [[TMP8:%.*]] = add i32 [[INDEX5]], 0
+; PREDBODY-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP8]], i32 [[N]])
+; PREDBODY-NEXT:    [[TMP9:%.*]] = shl nuw nsw <4 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1>
+; PREDBODY-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP9]]
+; PREDBODY-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDBODY-NEXT:    [[TMP11:%.*]] = or <4 x i32> [[TMP9]], <i32 1, i32 1, i32 1, i32 1>
+; PREDBODY-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP11]]
+; PREDBODY-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP12]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDBODY-NEXT:    [[TMP13:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER6]], [[WIDE_MASKED_GATHER]]
+; PREDBODY-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[Y]], i32 [[TMP8]]
+; PREDBODY-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; PREDBODY-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP13]], ptr [[TMP15]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; PREDBODY-NEXT:    [[INDEX_NEXT7]] = add i32 [[INDEX5]], 4
+; PREDBODY-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; PREDBODY-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT7]], [[N_VEC3]]
+; PREDBODY-NEXT:    br i1 [[TMP16]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PREDBODY:       for.cond.cleanup:
+; PREDBODY-NEXT:    ret i32 0
+; The last group of checks expects the identical shape once more; only the names of the loop-metadata captures differ from the group above.
+; PREDEPI-LABEL: @interleave(
+; PREDEPI-NEXT:  entry:
+; PREDEPI-NEXT:    [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; PREDEPI-NEXT:    br i1 [[CMP12]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; PREDEPI:       vector.main.loop.iter.check:
+; PREDEPI-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; PREDEPI-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PREDEPI:       vector.ph:
+; PREDEPI-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; PREDEPI-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; PREDEPI-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PREDEPI:       vector.body:
+; PREDEPI-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDEPI-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; PREDEPI-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[TMP0]], 1
+; PREDEPI-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP1]]
+; PREDEPI-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; PREDEPI-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
+; PREDEPI-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; PREDEPI-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; PREDEPI-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; PREDEPI-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[Y:%.*]], i32 [[TMP0]]
+; PREDEPI-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; PREDEPI-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP6]], align 4
+; PREDEPI-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; PREDEPI-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDEPI-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; PREDEPI:       middle.block:
+; PREDEPI-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; PREDEPI-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_PH]]
+; PREDEPI:       vec.epilog.ph:
+; PREDEPI-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; PREDEPI-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; PREDEPI-NEXT:    [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 4
+; PREDEPI-NEXT:    [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]]
+; PREDEPI-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
+; PREDEPI-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PREDEPI-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; PREDEPI-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; PREDEPI:       vec.epilog.vector.body:
+; PREDEPI-NEXT:    [[INDEX5:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDEPI-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDEPI-NEXT:    [[TMP8:%.*]] = add i32 [[INDEX5]], 0
+; PREDEPI-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP8]], i32 [[N]])
+; PREDEPI-NEXT:    [[TMP9:%.*]] = shl nuw nsw <4 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1>
+; PREDEPI-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP9]]
+; PREDEPI-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDEPI-NEXT:    [[TMP11:%.*]] = or <4 x i32> [[TMP9]], <i32 1, i32 1, i32 1, i32 1>
+; PREDEPI-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP11]]
+; PREDEPI-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP12]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDEPI-NEXT:    [[TMP13:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER6]], [[WIDE_MASKED_GATHER]]
+; PREDEPI-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[Y]], i32 [[TMP8]]
+; PREDEPI-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; PREDEPI-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP13]], ptr [[TMP15]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; PREDEPI-NEXT:    [[INDEX_NEXT7]] = add i32 [[INDEX5]], 4
+; PREDEPI-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; PREDEPI-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT7]], [[N_VEC3]]
+; PREDEPI-NEXT:    br i1 [[TMP16]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PREDEPI:       for.cond.cleanup:
+; PREDEPI-NEXT:    ret i32 0
+; Input under test, a scalar loop computing y[i] = x[2*i] * x[2*i+1] for i in [0, n); the function always returns 0.
+entry:
+  %cmp12 = icmp sgt i32 %n, 0 ; the loop is skipped entirely when n <= 0 (signed)
+  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nuw nsw i32 %indvars.iv, 1 ; even element index, 2*i
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %0
+  %1 = load i32, ptr %arrayidx, align 4
+  %2 = or i32 %0, 1 ; odd element index, 2*i+1 (the shift left the low bit clear)
+  %arrayidx3 = getelementptr inbounds i32, ptr %x, i32 %2
+  %3 = load i32, ptr %arrayidx3, align 4
+  %mul4 = mul nsw i32 %3, %1
+  %arrayidx6 = getelementptr inbounds i32, ptr %y, i32 %indvars.iv
+  store i32 %mul4, ptr %arrayidx6, align 4
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond.not = icmp eq i32 %indvars.iv.next, %n ; leave the loop once i+1 reaches n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret i32 0
+}
+
+define noundef i32 @reduce_add(ptr nocapture noundef readonly %x, i32 noundef %n) #0 {
+; DEFAULT-LABEL: @reduce_add(
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DEFAULT-NEXT:    br i1 [[CMP4]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; DEFAULT-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
+; DEFAULT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; DEFAULT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; DEFAULT-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; DEFAULT-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
+; DEFAULT-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
+; DEFAULT-NEXT:    [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; DEFAULT-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; DEFAULT:       for.cond.cleanup:
+; DEFAULT-NEXT:    [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    ret i32 [[S_0_LCSSA]]
+; The next group of checks expects the same fully lane-masked vector loop as the group above, with neither a scalar remainder loop nor a vector epilogue.
+; PREDBODY-LABEL: @reduce_add(
+; PREDBODY-NEXT:  entry:
+; PREDBODY-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; PREDBODY-NEXT:    br i1 [[CMP4]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; PREDBODY:       vector.ph:
+; PREDBODY-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; PREDBODY-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
+; PREDBODY-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; PREDBODY-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PREDBODY:       vector.body:
+; PREDBODY-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDBODY-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; PREDBODY-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; PREDBODY-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
+; PREDBODY-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; PREDBODY-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; PREDBODY-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDBODY-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
+; PREDBODY-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
+; PREDBODY-NEXT:    [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]]
+; PREDBODY-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; PREDBODY-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDBODY-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PREDBODY:       for.cond.cleanup:
+; PREDBODY-NEXT:    [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ]
+; PREDBODY-NEXT:    ret i32 [[S_0_LCSSA]]
+; The last group of checks expects an unmasked main vector loop plus a lane-masked vector epilogue; the partial sum is carried into the epilogue by a phi in vec.epilog.ph.
+; PREDEPI-LABEL: @reduce_add(
+; PREDEPI-NEXT:  entry:
+; PREDEPI-NEXT:    [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; PREDEPI-NEXT:    br i1 [[CMP4]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; PREDEPI:       vector.main.loop.iter.check:
+; PREDEPI-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; PREDEPI-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PREDEPI:       vector.ph:
+; PREDEPI-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; PREDEPI-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; PREDEPI-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PREDEPI:       vector.body:
+; PREDEPI-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDEPI-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; PREDEPI-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; PREDEPI-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP0]]
+; PREDEPI-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; PREDEPI-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
+; PREDEPI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
+; PREDEPI-NEXT:    [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]]
+; PREDEPI-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; PREDEPI-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDEPI-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; PREDEPI:       middle.block:
+; PREDEPI-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; PREDEPI-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_PH]]
+; PREDEPI:       vec.epilog.ph:
+; PREDEPI-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
+; PREDEPI-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; PREDEPI-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; PREDEPI-NEXT:    [[N_MOD_VF1:%.*]] = urem i32 [[N_RND_UP]], 4
+; PREDEPI-NEXT:    [[N_VEC2:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF1]]
+; PREDEPI-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; PREDEPI:       vec.epilog.vector.body:
+; PREDEPI-NEXT:    [[INDEX3:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDEPI-NEXT:    [[VEC_PHI4:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDEPI-NEXT:    [[TMP6:%.*]] = add i32 [[INDEX3]], 0
+; PREDEPI-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP6]], i32 [[N]])
+; PREDEPI-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP6]]
+; PREDEPI-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; PREDEPI-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP8]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDEPI-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
+; PREDEPI-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
+; PREDEPI-NEXT:    [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI4]]
+; PREDEPI-NEXT:    [[INDEX_NEXT5]] = add i32 [[INDEX3]], 4
+; PREDEPI-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT5]], [[N_VEC2]]
+; PREDEPI-NEXT:    br i1 [[TMP12]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; PREDEPI:       for.cond.cleanup:
+; PREDEPI-NEXT:    [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[TMP11]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDEPI-NEXT:    ret i32 [[S_0_LCSSA]]
+; Input under test, a scalar loop that sums x[i] for i in [0, n) and returns the total (0 when n <= 0).
+entry:
+  %cmp4 = icmp sgt i32 %n, 0 ; the loop is skipped entirely when n <= 0 (signed)
+  br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %s.05 = phi i32 [ 0, %for.body.preheader ], [ %add, %for.body ] ; running sum, starts at 0
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %add = add nsw i32 %0, %s.05
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond.not = icmp eq i32 %indvars.iv.next, %n ; leave the loop once i+1 reaches n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] ; 0 if the loop never ran, otherwise the final sum
+  ret i32 %s.0.lcssa
+}
+
+
+define noundef i32 @reduce_interleave(ptr nocapture noundef readonly %x, i32 noundef %n) #0 {
+; DEFAULT-LABEL: @reduce_interleave(
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; DEFAULT-NEXT:    br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; DEFAULT:       for.body.preheader:
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; DEFAULT-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[TMP0]], 1
+; DEFAULT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP1]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; DEFAULT-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
+; DEFAULT-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; DEFAULT-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; DEFAULT-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; DEFAULT-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; DEFAULT-NEXT:    [[TMP6]] = add i32 [[TMP5]], [[VEC_PHI]]
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; DEFAULT-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; DEFAULT-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
+; DEFAULT:       for.body:
+; DEFAULT-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; DEFAULT-NEXT:    [[S_012:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD5:%.*]], [[FOR_BODY]] ]
+; DEFAULT-NEXT:    [[TMP8:%.*]] = shl nuw nsw i32 [[INDVARS_IV]], 1
+; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP8]]
+; DEFAULT-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; DEFAULT-NEXT:    [[TMP10:%.*]] = or i32 [[TMP8]], 1
+; DEFAULT-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP10]]
+; DEFAULT-NEXT:    [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; DEFAULT-NEXT:    [[MUL4:%.*]] = mul nsw i32 [[TMP11]], [[TMP9]]
+; DEFAULT-NEXT:    [[ADD5]] = add nsw i32 [[MUL4]], [[S_012]]
+; DEFAULT-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
+; DEFAULT-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]]
+; DEFAULT-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; DEFAULT:       for.cond.cleanup:
+; DEFAULT-NEXT:    [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD5]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; DEFAULT-NEXT:    ret i32 [[S_0_LCSSA]]
+;
+; PREDBODY-LABEL: @reduce_interleave(
+; PREDBODY-NEXT:  entry:
+; PREDBODY-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; PREDBODY-NEXT:    br i1 [[CMP11]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; PREDBODY:       vector.main.loop.iter.check:
+; PREDBODY-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; PREDBODY-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PREDBODY:       vector.ph:
+; PREDBODY-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; PREDBODY-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; PREDBODY-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PREDBODY:       vector.body:
+; PREDBODY-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDBODY-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; PREDBODY-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; PREDBODY-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[TMP0]], 1
+; PREDBODY-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP1]]
+; PREDBODY-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; PREDBODY-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
+; PREDBODY-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; PREDBODY-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; PREDBODY-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; PREDBODY-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; PREDBODY-NEXT:    [[TMP6]] = add i32 [[TMP5]], [[VEC_PHI]]
+; PREDBODY-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; PREDBODY-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDBODY-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; PREDBODY:       middle.block:
+; PREDBODY-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; PREDBODY-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_PH]]
+; PREDBODY:       vec.epilog.ph:
+; PREDBODY-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; PREDBODY-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; PREDBODY-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; PREDBODY-NEXT:    [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 4
+; PREDBODY-NEXT:    [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]]
+; PREDBODY-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
+; PREDBODY-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PREDBODY-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; PREDBODY-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; PREDBODY:       vec.epilog.vector.body:
+; PREDBODY-NEXT:    [[INDEX5:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDBODY-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDBODY-NEXT:    [[VEC_PHI6:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDBODY-NEXT:    [[TMP8:%.*]] = add i32 [[INDEX5]], 0
+; PREDBODY-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP8]], i32 [[N]])
+; PREDBODY-NEXT:    [[TMP9:%.*]] = shl nuw nsw <4 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1>
+; PREDBODY-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP9]]
+; PREDBODY-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDBODY-NEXT:    [[TMP11:%.*]] = or <4 x i32> [[TMP9]], <i32 1, i32 1, i32 1, i32 1>
+; PREDBODY-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP11]]
+; PREDBODY-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP12]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDBODY-NEXT:    [[TMP13:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[WIDE_MASKED_GATHER]]
+; PREDBODY-NEXT:    [[TMP14:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP13]], <4 x i32> zeroinitializer
+; PREDBODY-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]])
+; PREDBODY-NEXT:    [[TMP16]] = add i32 [[TMP15]], [[VEC_PHI6]]
+; PREDBODY-NEXT:    [[INDEX_NEXT8]] = add i32 [[INDEX5]], 4
+; PREDBODY-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; PREDBODY-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT8]], [[N_VEC3]]
+; PREDBODY-NEXT:    br i1 [[TMP17]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; PREDBODY:       for.cond.cleanup:
+; PREDBODY-NEXT:    [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP16]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDBODY-NEXT:    ret i32 [[S_0_LCSSA]]
+;
+; PREDEPI-LABEL: @reduce_interleave(
+; PREDEPI-NEXT:  entry:
+; PREDEPI-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; PREDEPI-NEXT:    br i1 [[CMP11]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; PREDEPI:       vector.main.loop.iter.check:
+; PREDEPI-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; PREDEPI-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; PREDEPI:       vector.ph:
+; PREDEPI-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; PREDEPI-NEXT:    [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; PREDEPI-NEXT:    br label [[VECTOR_BODY:%.*]]
+; PREDEPI:       vector.body:
+; PREDEPI-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDEPI-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; PREDEPI-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; PREDEPI-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[TMP0]], 1
+; PREDEPI-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP1]]
+; PREDEPI-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; PREDEPI-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4
+; PREDEPI-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; PREDEPI-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; PREDEPI-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; PREDEPI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; PREDEPI-NEXT:    [[TMP6]] = add i32 [[TMP5]], [[VEC_PHI]]
+; PREDEPI-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; PREDEPI-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; PREDEPI-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; PREDEPI:       middle.block:
+; PREDEPI-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; PREDEPI-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_PH]]
+; PREDEPI:       vec.epilog.ph:
+; PREDEPI-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; PREDEPI-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; PREDEPI-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; PREDEPI-NEXT:    [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 4
+; PREDEPI-NEXT:    [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]]
+; PREDEPI-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
+; PREDEPI-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; PREDEPI-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; PREDEPI-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; PREDEPI:       vec.epilog.vector.body:
+; PREDEPI-NEXT:    [[INDEX5:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDEPI-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDEPI-NEXT:    [[VEC_PHI6:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDEPI-NEXT:    [[TMP8:%.*]] = add i32 [[INDEX5]], 0
+; PREDEPI-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP8]], i32 [[N]])
+; PREDEPI-NEXT:    [[TMP9:%.*]] = shl nuw nsw <4 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1>
+; PREDEPI-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP9]]
+; PREDEPI-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDEPI-NEXT:    [[TMP11:%.*]] = or <4 x i32> [[TMP9]], <i32 1, i32 1, i32 1, i32 1>
+; PREDEPI-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[X]], <4 x i32> [[TMP11]]
+; PREDEPI-NEXT:    [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP12]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; PREDEPI-NEXT:    [[TMP13:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[WIDE_MASKED_GATHER]]
+; PREDEPI-NEXT:    [[TMP14:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP13]], <4 x i32> zeroinitializer
+; PREDEPI-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP14]])
+; PREDEPI-NEXT:    [[TMP16]] = add i32 [[TMP15]], [[VEC_PHI6]]
+; PREDEPI-NEXT:    [[INDEX_NEXT8]] = add i32 [[INDEX5]], 4
+; PREDEPI-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; PREDEPI-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT8]], [[N_VEC3]]
+; PREDEPI-NEXT:    br i1 [[TMP17]], label [[FOR_COND_CLEANUP]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; PREDEPI:       for.cond.cleanup:
+; PREDEPI-NEXT:    [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP16]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; PREDEPI-NEXT:    ret i32 [[S_0_LCSSA]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %n, 0
+  br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %s.012 = phi i32 [ 0, %for.body.preheader ], [ %add5, %for.body ]
+  %0 = shl nuw nsw i32 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %0
+  %1 = load i32, ptr %arrayidx, align 4
+  %2 = or i32 %0, 1
+  %arrayidx3 = getelementptr inbounds i32, ptr %x, i32 %2
+  %3 = load i32, ptr %arrayidx3, align 4
+  %mul4 = mul nsw i32 %3, %1
+  %add5 = add nsw i32 %mul4, %s.012
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond.not = icmp eq i32 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %s.0.lcssa = phi i32 [ 0, %entry ], [ %add5, %for.body ]
+  ret i32 %s.0.lcssa
+}
+
+attributes #0 = { "target-features"="+mve" } ; Attribute group #0: sets the "target-features" string attribute to "+mve" on whatever references #0.
Index: llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -18,6 +18,7 @@
 ; CHECK-NEXT:  LV: Found an induction variable.
 ; CHECK-NEXT:  LV: Did not find one integer induction var.
 ; CHECK-NEXT:  LV: We can vectorize this loop (with a runtime bound check)!
+; CHECK-NEXT:  LV: Target has picked ScalarEpilogue Predication
 ; CHECK-NEXT:  LV: Found trip count: 0
 ; CHECK-NEXT:  LV: Scalable vectorization is available
 ; CHECK-NEXT:  LV: The max safe fixed VF is: 67108864.
@@ -151,6 +152,7 @@
 ; CHECK-NEXT:  LV: Found FP op with unsafe algebra.
 ; CHECK-NEXT:  LV: Did not find one integer induction var.
 ; CHECK-NEXT:  LV: We can vectorize this loop (with a runtime bound check)!
+; CHECK-NEXT:  LV: Target has picked ScalarEpilogue Predication
 ; CHECK-NEXT:  LV: Found trip count: 0
 ; CHECK-NEXT:  LV: Scalable vectorization is available
 ; CHECK-NEXT:  LV: The max safe fixed VF is: 67108864.