diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2512,6 +2512,12 @@
                                   unsigned Index, unsigned FieldIndex,
                                   MDNode *DbgInfo);
 
+  /// Return an all-true boolean vector of size and scalability \p NumElts.
+  Value *getTrueVector(ElementCount NumElts) {
+    VectorType *VTy = VectorType::get(Type::getInt1Ty(Context), NumElts);
+    return Constant::getAllOnesValue(VTy);
+  }
+
 private:
   /// Helper function that creates an assume intrinsic call that
   /// represents an alignment assumption on the provided pointer \p PtrValue
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -75,6 +75,14 @@
                                 const APInt &Imm, Type *Ty,
                                 TTI::TargetCostKind CostKind);
 
+  /// \name Vector Predication Information
+  /// Whether the target supports the %evl parameter of VP intrinsics
+  /// efficiently in hardware, for the given opcode and type/alignment (see
+  /// LLVM Language Reference - "Vector Predication Intrinsics",
+  /// https://llvm.org/docs/LangRef.html#vector-predication-intrinsics).
+  bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
+                             Align Alignment) const;
+
   TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
 
   bool shouldExpandReduction(const IntrinsicInst *II) const;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -185,6 +185,19 @@
   return TTI::TCC_Free;
 }
 
+bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
+  if (!DataTy)
+    return ST->hasVInstructions();
+
+  if (DataTy->isFloatTy())
+    return ST->hasVInstructionsF32();
+  if (DataTy->isDoubleTy())
+    return ST->hasVInstructionsF64();
+
+  unsigned IntWidth = DataTy->getIntegerBitWidth();
+  return IntWidth <= ST->getELEN();
+}
+
 TargetTransformInfo::PopcntSupportKind
 RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -249,6 +249,44 @@
                    "data-and-control-without-rt-check",
                    "Similar to data-and-control, but remove the runtime check")));
 
+// Option prefer-predicate-with-vp-intrinsics is an experimental switch to
+// indicate that the loop vectorizer should try to generate VP intrinsics if
+// tail-folding is enabled (note that this option is dependent on the
+// prefer-predicate-over-epilogue option being set to
+// predicate-dont-vectorize). This can be particularly useful for targets like
+// RISC-V and SX-Aurora that support vector length predication.
+// Currently this switch takes three possible values:
+// 0. no-predication: Do not generate VP intrinsics.
+// 1. if-explicit-vector-length-support: Only generate VP intrinsics if the
+//    target supports explicit vector length based predication.
+// 2. force-explicit-vector-length-support: This is purely an
+//    experimental/testing option which will be removed in the future. It
+//    forces the loop vectorizer to assume that the target supports vector
+//    length predication.
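As a usage sketch, mirroring the RUN lines of the tests added later in this patch (the input file name here is only a placeholder), the new switch is meant to be combined with predicate-dont-vectorize tail folding:

    opt -passes=loop-vectorize \
        -prefer-predicate-over-epilogue=predicate-dont-vectorize \
        -prefer-predicate-with-vp-intrinsics=if-explicit-vector-length-support \
        -mtriple=riscv64 -mattr=+v -S input.ll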
+namespace {
+enum class EVLOption {
+  NoPredication = 0,
+  IfEVLSupported,
+  ForceEVLSupport
+};
+} // namespace
+
+static cl::opt<EVLOption> PreferPredicateWithVPIntrinsics(
+    "prefer-predicate-with-vp-intrinsics", cl::init(EVLOption::NoPredication),
+    cl::Hidden,
+    cl::desc("When vectorizing with tail-folding, generate vector predication "
+             "intrinsics."),
+    cl::values(
+        clEnumValN(EVLOption::NoPredication, "no-predication",
+                   "Do not generate VP intrinsics."),
+        clEnumValN(EVLOption::IfEVLSupported,
+                   "if-explicit-vector-length-support",
+                   "Only generate VP intrinsics if the target supports vector "
+                   "length predication."),
+        clEnumValN(EVLOption::ForceEVLSupport,
+                   "force-explicit-vector-length-support",
+                   "Assume that the target supports vector length predication "
+                   "and generate VP intrinsics accordingly.")));
+
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -1589,6 +1627,11 @@
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
+  /// Returns true if VP intrinsics should be generated in the tail-folded
+  /// loop.
+  bool useVPVectorization() const {
+    return PreferVPIntrinsics && foldTailByMasking();
+  }
+
   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
   /// nodes to the chain of instructions representing the reductions. Uses a
   /// MapVector to ensure deterministic iteration order.
@@ -1748,6 +1791,9 @@
   /// All blocks of loop are to be masked to fold tail of scalar iterations.
   bool CanFoldTailByMasking = false;
 
+  /// Control whether to generate VP intrinsics in vectorized code.
+  bool PreferVPIntrinsics = false;
+
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
@@ -2835,6 +2881,19 @@
   if (VectorTripCount)
     return VectorTripCount;
 
+  if (Cost->useVPVectorization()) {
+    Value *TC = getTripCount();
+    // Loop has multiple exits. Make sure the scalar remainder executes at
+    // least one scalar iteration to perform the correct jump.
+    if (Cost->requiresScalarEpilogue(VF)) {
+      IRBuilder<> Builder(InsertBlock->getTerminator());
+      TC = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
+                             "adj.for.rem.tc");
+    }
+    return VectorTripCount = TC;
+  }
+
   Value *TC = getTripCount();
   IRBuilder<> Builder(InsertBlock->getTerminator());
 
@@ -5153,6 +5212,34 @@
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
   if (Legal->prepareToFoldTailByMasking()) {
     CanFoldTailByMasking = true;
+    if (PreferPredicateWithVPIntrinsics == EVLOption::NoPredication)
+      return MaxFactors;
+
+    if (UserIC > 1) {
+      LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will "
+                           "not generate VP intrinsics since the interleave "
+                           "count specified is greater than 1.\n");
+      return MaxFactors;
+    }
+
+    if (MaxFactors.ScalableVF.isScalable() &&
+        MaxFactors.ScalableVF.isNonZero()) {
+      if (PreferPredicateWithVPIntrinsics == EVLOption::IfEVLSupported) {
+        // FIXME: use actual opcode/data type for analysis here.
+        PreferVPIntrinsics = TTI.hasActiveVectorLength(0, nullptr, Align());
+        LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will "
+                             "try to generate VP intrinsics if the target "
+                             "supports vector length predication.\n");
+      } else {
+        PreferVPIntrinsics = true;
+        LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will "
+                             "try to generate VP intrinsics.\n");
+      }
+
+      if (PreferVPIntrinsics)
+        MaxFactors.FixedVF = ElementCount::getFixed(1);
+    }
+
     return MaxFactors;
   }
 
@@ -5725,6 +5812,11 @@
   if (!isScalarEpilogueAllowed())
     return 1;
 
+  // Do not interleave if VP intrinsics are preferred and no user interleave
+  // count is specified.
+  if (useVPVectorization())
+    return 1;
+
   // We used the distance for the interleave count.
   if (Legal->getMaxSafeDepDistBytes() != -1U)
     return 1;
@@ -8130,7 +8222,7 @@
   VPValue *BlockMask = nullptr;
 
   if (OrigLoop->getHeader() == BB) {
-    if (!CM.blockNeedsPredicationForAnyReason(BB))
+    if (!CM.blockNeedsPredicationForAnyReason(BB) || CM.useVPVectorization())
       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
 
     assert(CM.foldTailByMasking() && "must fold the tail");
@@ -8689,7 +8781,7 @@
 // Add the necessary canonical IV and branch recipes required to control the
 // loop.
 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
-                                  TailFoldingStyle Style) {
+                                  TailFoldingStyle Style, bool NeedEVL) {
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   auto *StartV = Plan.getVPValueOrAddLiveIn(StartIdx);
 
@@ -8699,17 +8791,27 @@
   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
   Header->insert(CanonicalIVPHI, Header->begin());
 
+  VPEVLRecipe *VPEVL = nullptr;
+  if (NeedEVL) {
+    VPEVL = new VPEVLRecipe(Plan.getCanonicalIV(), &Plan.getVectorTripCount());
+    Header->appendRecipe(VPEVL);
+  }
+
   // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
   // IV by VF * UF.
+  SmallVector<VPValue *> Args(1, CanonicalIVPHI);
+  if (VPEVL)
+    Args.push_back(VPEVL);
+
   bool HasNUW = Style == TailFoldingStyle::None;
   auto *CanonicalIVIncrement =
       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
                                : VPInstruction::CanonicalIVIncrement,
-                        {CanonicalIVPHI}, DL, "index.next");
+                        Args, DL, "index.next");
   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
 
   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
-  if (useActiveLaneMaskForControlFlow(Style)) {
+  if (!NeedEVL && useActiveLaneMaskForControlFlow(Style)) {
     // Create the active lane mask instruction in the vplan preheader.
     VPBasicBlock *VecPreheader =
         cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
@@ -8893,7 +8995,8 @@
       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
-                        CM.getTailFoldingStyle(IVUpdateMayOverflow));
+                        CM.getTailFoldingStyle(IVUpdateMayOverflow),
+                        CM.useVPVectorization());
 
   // Scan the body of the loop in a topological order to visit each basic block
   // after having visited its predecessor basic blocks.
@@ -8961,7 +9064,11 @@
           HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
         // Move VPWidenIntOrFpInductionRecipes for optimized truncates to the
        // phi section of HeaderVPBB.
-        assert(isa<TruncInst>(Instr));
+        // Do the same if a VPEVLRecipe was emitted for predicated
+        // vectorization support.
+        // TODO: try to avoid moving recipes by inserting them in the proper place.
+        assert(isa<TruncInst>(Instr) ||
+               isa<VPEVLRecipe>(HeaderVPBB->getFirstNonPhi()));
         Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
       } else
         VPBB->appendRecipe(Recipe);
@@ -9088,7 +9195,7 @@
   Term->eraseFromParent();
 
   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
-                        CM.getTailFoldingStyle());
+                        CM.getTailFoldingStyle(), CM.useVPVectorization());
   return Plan;
 }
 
@@ -9650,6 +9757,18 @@
     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
   };
 
+  auto MaskValue = [&](unsigned Part, ElementCount EC) -> Value * {
+    VPValue *Mask = getMask();
+    assert(Mask && "MaskValue must be called for recipes with a set mask");
+    // The outermost mask can be lowered as an all-ones mask when using
+    // EVL.
+    if (auto *IMask = dyn_cast<VPInstruction>(Mask))
+      if (IMask->getOpcode() == VPInstruction::ICmpULE)
+        return Builder.getTrueVector(EC);
+
+    return BlockInMaskParts[Part];
+  };
+
   // Handle Stores:
   if (SI) {
     State.setDebugLocFromInst(SI);
@@ -9657,6 +9776,11 @@
     for (unsigned Part = 0; Part < State.UF; ++Part) {
       Instruction *NewSI = nullptr;
       Value *StoredVal = State.get(StoredValue, Part);
+      // If EVL is not nullptr, then EVL must be a valid value set during plan
+      // creation, possibly a default value of the whole vector register
+      // length. EVL is created only if TTI prefers predicated vectorization,
+      // thus a non-null EVL also implies a preference for predicated
+      // vectorization.
+      Value *EVLPart = State.EVL ? State.get(State.EVL, Part) : nullptr;
      if (CreateGatherScatter) {
         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
         Value *VectorGep = State.get(getAddr(), Part);
@@ -9672,11 +9796,27 @@
         }
         auto *VecPtr =
             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
-        if (isMaskRequired)
+        // If EVLPart is not null, we can vectorize using a predicated
+        // intrinsic.
+        if (EVLPart) {
+          auto *StoredValTy = cast<VectorType>(StoredVal->getType());
+          BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+          Function *VPIntr = Intrinsic::getDeclaration(
+              VectorPH->getModule(), Intrinsic::vp_store,
+              {StoredValTy, VecPtr->getType()});
+          Value *BlockInMaskPart =
+              isMaskRequired
+                  ? MaskValue(Part, StoredValTy->getElementCount())
+                  : Builder.getTrueVector(StoredValTy->getElementCount());
+
+          NewSI = Builder.CreateCall(
+              VPIntr, {StoredVal, VecPtr, BlockInMaskPart, EVLPart});
+        } else if (isMaskRequired) {
           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                             BlockInMaskParts[Part]);
-        else
+        } else {
           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
+        }
       }
       State.addMetadata(NewSI, SI);
     }
@@ -9688,6 +9828,13 @@
   State.setDebugLocFromInst(LI);
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Value *NewLI;
+
+    // If EVL is not nullptr, then EVL must be a valid value set during plan
+    // creation, possibly a default value of the whole vector register length.
+    // EVL is created only if TTI prefers predicated vectorization, thus a
+    // non-null EVL also implies a preference for predicated vectorization.
+    Value *EVLPart = State.EVL ? State.get(State.EVL, Part) : nullptr;
+
     if (CreateGatherScatter) {
       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
       Value *VectorGep = State.get(getAddr(), Part);
@@ -9697,13 +9844,26 @@
     } else {
       auto *VecPtr =
           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
-      if (isMaskRequired)
+      if (EVLPart) {
+        BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+        Function *VPIntr =
+            Intrinsic::getDeclaration(VectorPH->getModule(), Intrinsic::vp_load,
+                                      {DataTy, VecPtr->getType()});
+
+        Value *BlockInMaskPart =
+            isMaskRequired ? MaskValue(Part, DataTy->getElementCount())
+                           : Builder.getTrueVector(DataTy->getElementCount());
+
+        NewLI = Builder.CreateCall(VPIntr, {VecPtr, BlockInMaskPart, EVLPart},
+                                   "vp.op.load");
+      } else if (isMaskRequired) {
        NewLI = Builder.CreateMaskedLoad(
            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
            PoisonValue::get(DataTy), "wide.masked.load");
-      else
+      } else {
        NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment,
                                          "wide.load");
+      }
 
       // Add metadata to the load, but setVectorValue to the reverse shuffle.
       State.addMetadata(NewLI, LI);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -242,6 +242,12 @@
   ElementCount VF;
   unsigned UF;
 
+  // If EVL is not nullptr, then EVL must be a valid value set during plan
+  // creation, possibly a default value of the whole vector register length.
+  // EVL is created only if TTI prefers predicated vectorization, thus a
+  // non-null EVL also implies a preference for predicated vectorization.
+  VPValue *EVL = nullptr;
+
   /// Hold the indices to generate specific scalar instructions. Null indicates
   /// that all instances are to be generated, using either scalar or vector
   /// instructions.
@@ -2049,6 +2055,43 @@
                    VPValue *Step, Type *Ty) const;
 };
 
+/// A recipe to generate the Explicit Vector Length (EVL) value to be used
+/// with VPred intrinsics.
+/// There are three ways to compute the EVL parameter for the VP intrinsics:
+///
+/// 1. The simplest way is to use the VF as EVL and rely solely on the mask
+///    parameter to control predication. The mask parameter is the same as
+///    computed for the current tail-folding implementation.
+/// 2. The second way is to insert instructions to compute min(VF, trip_count -
+///    index) for each vector iteration.
+/// 3. For architectures like RISC-V, which have a special instruction to
+///    compute/set an explicit vector length, we also introduce an experimental
+///    intrinsic, set_vector_length, that can be lowered to architecture
+///    specific instruction(s) to compute EVL.
+class VPEVLRecipe : public VPRecipeBase, public VPValue {
+public:
+  VPEVLRecipe(VPValue *IV, VPValue *TC)
+      : VPRecipeBase(VPDef::VPEVLSC, {IV, TC}), VPValue(this) {}
+  ~VPEVLRecipe() override = default;
+
+  /// Return the VPValue representing the induction variable.
+  VPValue *getIV() const { return getOperand(0); }
+
+  /// Return the VPValue representing the vector trip count.
+  VPValue *getVectorTripCount() const { return getOperand(1); }
+
+  VP_CLASSOF_IMPL(VPDef::VPEVLSC)
+
+  /// Generate the instructions to compute EVL.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for generating the active lane mask for the vector loop that is
 /// used to predicate the vector operations.
 /// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -314,13 +314,22 @@
   case VPInstruction::CanonicalIVIncrement:
   case VPInstruction::CanonicalIVIncrementNUW: {
     Value *Next = nullptr;
-    if (Part == 0) {
+    bool HasEVL = getNumOperands() == 2;
+    Value *EVL = HasEVL ? State.get(getOperand(1), Part) : nullptr;
+    if (Part == 0 || EVL) {
       bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
       auto *Phi = State.get(getOperand(0), 0);
       // The loop step is equal to the vectorization factor (num of SIMD
       // elements) times the unroll factor (num of SIMD instructions).
+      if (HasEVL) {
+        EVL = Builder.CreateIntCast(EVL, Phi->getType(), /*isSigned=*/false);
+        if (State.UF > 1)
+          EVL = Builder.CreateMul(EVL,
+                                  ConstantInt::get(Phi->getType(), State.UF));
+      }
       Value *Step =
-          createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
+          HasEVL ? EVL
+                 : createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
       Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false);
     } else {
       Next = State.get(this, 0);
@@ -1137,6 +1146,45 @@
 }
 #endif
 
+void VPEVLRecipe::execute(VPTransformState &State) {
+  // Set EVL.
+  auto GetSetVL = [=](VPTransformState &State, Value *EVL) {
+    assert(EVL->getType()->isIntegerTy() &&
+           "Requested vector length should be an integer.");
+
+    // TODO: Add support for MaxSafeDist for correct loop emission.
+    Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
+
+    Value *GVL = State.Builder.CreateIntrinsic(
+        State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
+        {EVL, VFArg, State.Builder.getInt1(State.VF.isScalable())});
+    return GVL;
+  };
+  // TODO: Restructure this code with an explicit remainder loop so that
+  // vsetvli can be hoisted out of the main loop, which would allow the main
+  // loop to be interleaved or unrolled.
+  assert(State.UF < 2 && "Neither unrolling nor interleaving is supported by "
+                         "predicated vectorization.");
+  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+    // Compute TC - IV as the EVL (requested vector length).
+    Value *IV = State.get(getIV(), Part);
+    Value *TripCount = State.get(getVectorTripCount(), Part);
+    Value *EVL = State.Builder.CreateSub(TripCount, IV);
+    Value *SetVL = GetSetVL(State, EVL);
+    State.set(this, SetVL, Part);
+  }
+  State.EVL = this;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPEVLRecipe::print(raw_ostream &O, const Twine &Indent,
+                        VPSlotTracker &SlotTracker) const {
+  O << Indent << "EMIT ";
+  printAsOperand(O, SlotTracker);
+  O << " = EXPLICIT-VECTOR-LENGTH";
+}
+#endif
+
 bool VPCanonicalIVPHIRecipe::isCanonical(
     InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step,
     Type *Ty) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -350,6 +350,7 @@
     VPWidenMemoryInstructionSC,
     VPWidenSC,
     VPWidenSelectSC,
+    VPEVLSC,
     // START: Phi-like recipes. Need to be kept together.
VPBlendSC, VPPredInstPHISC, diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll @@ -0,0 +1,475 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S %s 2>&1 | FileCheck --check-prefix=IF-EVL %s + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S %s 2>&1 | FileCheck --check-prefix=FORCE-EVL %s + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -S %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i32 %N) { +; IF-EVL-LABEL: @foo( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; IF-EVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; IF-EVL: for.body.preheader: +; IF-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; IF-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[INDEX]] +; IF-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP1]], i32 4, i1 true) +; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr [[TMP4]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP2]]) +; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP2]]) +; IF-EVL-NEXT: [[TMP7:%.*]] = add nsw [[VP_OP_LOAD1]], [[VP_OP_LOAD]] +; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP7]], ptr [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP2]]) +; IF-EVL-NEXT: [[TMP10:%.*]] = zext i32 [[TMP2]] to i64 +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; IF-EVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]] +; IF-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label 
[[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[WIDE_TRIP_COUNT]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.cond.cleanup.loopexit: +; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP]] +; IF-EVL: for.cond.cleanup: +; IF-EVL-NEXT: ret void +; IF-EVL: for.body: +; IF-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]] +; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; IF-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +; FORCE-EVL-LABEL: @foo( +; FORCE-EVL-NEXT: entry: +; FORCE-EVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; FORCE-EVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; FORCE-EVL: for.body.preheader: +; FORCE-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; FORCE-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE-EVL: vector.body: +; FORCE-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; FORCE-EVL-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[INDEX]] +; FORCE-EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP1]], i32 4, i1 true) +; FORCE-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 +; FORCE-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr [[TMP4]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP2]]) +; FORCE-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 +; FORCE-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP2]]) +; FORCE-EVL-NEXT: [[TMP7:%.*]] = add nsw [[VP_OP_LOAD1]], [[VP_OP_LOAD]] +; FORCE-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; FORCE-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP7]], ptr [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP2]]) +; FORCE-EVL-NEXT: [[TMP10:%.*]] = zext i32 [[TMP2]] to i64 +; FORCE-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] +; FORCE-EVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]] +; 
FORCE-EVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FORCE-EVL: middle.block: +; FORCE-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; FORCE-EVL: scalar.ph: +; FORCE-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[WIDE_TRIP_COUNT]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; FORCE-EVL-NEXT: br label [[FOR_BODY:%.*]] +; FORCE-EVL: for.cond.cleanup.loopexit: +; FORCE-EVL-NEXT: br label [[FOR_COND_CLEANUP]] +; FORCE-EVL: for.cond.cleanup: +; FORCE-EVL-NEXT: ret void +; FORCE-EVL: for.body: +; FORCE-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; FORCE-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; FORCE-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; FORCE-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]] +; FORCE-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; FORCE-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; FORCE-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; FORCE-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +; NO-VP-LABEL: @foo( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; NO-VP-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; NO-VP: for.body.preheader: +; NO-VP-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; NO-VP-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]] +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[WIDE_TRIP_COUNT]]) +; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]] +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP5]] +; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP9]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; NO-VP-NEXT: [[TMP10:%.*]] = add nsw [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP5]] +; 
NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP10]], ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) +; NO-VP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; NO-VP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] +; NO-VP-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.cond.cleanup.loopexit: +; NO-VP-NEXT: br label [[FOR_COND_CLEANUP]] +; NO-VP: for.cond.cleanup: +; NO-VP-NEXT: ret void +; NO-VP: for.body: +; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP16]] +; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %add, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} + +define void @inteleave(ptr noalias %a, ptr noalias %b, ptr noalias %c, i32 %N) { +; IF-EVL-LABEL: @inteleave( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; IF-EVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; IF-EVL: for.body.preheader: +; IF-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; IF-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 
8 +; IF-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]] +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; IF-EVL-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 0 +; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 1 +; IF-EVL-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]] +; IF-EVL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[WIDE_TRIP_COUNT]]) +; IF-EVL-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP10]], i64 [[WIDE_TRIP_COUNT]]) +; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP15]] +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP5]] +; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; IF-EVL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 +; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP21]] +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP22]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; IF-EVL-NEXT: [[TMP23:%.*]] = add nsw [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; IF-EVL-NEXT: [[TMP24:%.*]] = add nsw [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD2]] +; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP5]] +; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0 +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP23]], ptr [[TMP27]], i32 4, [[ACTIVE_LANE_MASK]]) +; IF-EVL-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 4 +; IF-EVL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]] +; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP24]], ptr [[TMP30]], i32 4, [[ACTIVE_LANE_MASK1]]) +; IF-EVL-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; IF-EVL-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 8 +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP32]] +; IF-EVL-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br 
i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.cond.cleanup.loopexit: +; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP]] +; IF-EVL: for.cond.cleanup: +; IF-EVL-NEXT: ret void +; IF-EVL: for.body: +; IF-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP35]], [[TMP34]] +; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; IF-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; +; FORCE-EVL-LABEL: @inteleave( +; FORCE-EVL-NEXT: entry: +; FORCE-EVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; FORCE-EVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; FORCE-EVL: for.body.preheader: +; FORCE-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; FORCE-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; FORCE-EVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; FORCE-EVL-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; FORCE-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]] +; FORCE-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; FORCE-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; FORCE-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE-EVL: vector.body: +; FORCE-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-EVL-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; FORCE-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; FORCE-EVL-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 0 +; FORCE-EVL-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 1 +; FORCE-EVL-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]] +; FORCE-EVL-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[WIDE_TRIP_COUNT]]) +; FORCE-EVL-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP10]], i64 [[WIDE_TRIP_COUNT]]) +; FORCE-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]] +; FORCE-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP10]] +; FORCE-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call 
@llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; FORCE-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; FORCE-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP15]] +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; FORCE-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP5]] +; FORCE-EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP10]] +; FORCE-EVL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; FORCE-EVL-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 +; FORCE-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP21]] +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP22]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; FORCE-EVL-NEXT: [[TMP23:%.*]] = add nsw [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; FORCE-EVL-NEXT: [[TMP24:%.*]] = add nsw [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD2]] +; FORCE-EVL-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP5]] +; FORCE-EVL-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] +; FORCE-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0 +; FORCE-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP23]], ptr [[TMP27]], i32 4, [[ACTIVE_LANE_MASK]]) +; FORCE-EVL-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 4 +; FORCE-EVL-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]] +; FORCE-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP24]], ptr [[TMP30]], i32 4, [[ACTIVE_LANE_MASK1]]) +; FORCE-EVL-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; FORCE-EVL-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 8 +; FORCE-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP32]] +; FORCE-EVL-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCE-EVL-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FORCE-EVL: middle.block: +; FORCE-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; FORCE-EVL: scalar.ph: +; FORCE-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; FORCE-EVL-NEXT: br label [[FOR_BODY:%.*]] +; FORCE-EVL: for.cond.cleanup.loopexit: +; FORCE-EVL-NEXT: br label [[FOR_COND_CLEANUP]] +; FORCE-EVL: for.cond.cleanup: +; FORCE-EVL-NEXT: ret void +; FORCE-EVL: for.body: +; FORCE-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; FORCE-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; FORCE-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; FORCE-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP35]], [[TMP34]] +; FORCE-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: store i32 
[[ADD]], ptr [[ARRAYIDX4]], align 4 +; FORCE-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; FORCE-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; FORCE-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; +; NO-VP-LABEL: @inteleave( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; NO-VP-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; NO-VP: for.body.preheader: +; NO-VP-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; NO-VP-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; NO-VP-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]] +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; NO-VP-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 0 +; NO-VP-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 1 +; NO-VP-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]] +; NO-VP-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[WIDE_TRIP_COUNT]]) +; NO-VP-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP10]], i64 [[WIDE_TRIP_COUNT]]) +; NO-VP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]] +; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP10]] +; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; NO-VP-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 +; NO-VP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 [[TMP15]] +; NO-VP-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP16]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; NO-VP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP5]] +; NO-VP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP10]] +; NO-VP-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; NO-VP-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 +; NO-VP-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP21]] +; NO-VP-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP22]], i32 4, [[ACTIVE_LANE_MASK1]], poison) +; NO-VP-NEXT: [[TMP23:%.*]] = add nsw [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; NO-VP-NEXT: [[TMP24:%.*]] = add nsw [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD2]] +; NO-VP-NEXT: [[TMP25:%.*]] = 
getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP5]] +; NO-VP-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]] +; NO-VP-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0 +; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP23]], ptr [[TMP27]], i32 4, [[ACTIVE_LANE_MASK]]) +; NO-VP-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 4 +; NO-VP-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]] +; NO-VP-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP24]], ptr [[TMP30]], i32 4, [[ACTIVE_LANE_MASK1]]) +; NO-VP-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; NO-VP-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 8 +; NO-VP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP32]] +; NO-VP-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.cond.cleanup.loopexit: +; NO-VP-NEXT: br label [[FOR_COND_CLEANUP]] +; NO-VP: for.cond.cleanup: +; NO-VP-NEXT: ret void +; NO-VP: for.body: +; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP35]], [[TMP34]] +; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %add, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !0 +} + +!0 = distinct !{!0, !1, !2} +!1 = !{!"llvm.loop.interleave.count", i32 2} +!2 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll 
b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll @@ -0,0 +1,117 @@ +; REQUIRES: asserts + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -disable-output %s 2>&1 | FileCheck --check-prefix=IF-EVL %s + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -disable-output %s 2>&1 | FileCheck --check-prefix=FORCE-EVL %s + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=riscv64 -mattr=+v -disable-output %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i32 %N) { +; IF-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; IF-EVL-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count +; IF-EVL-EMPTY: +; IF-EVL: vector.ph: +; IF-EVL-NEXT: Successor(s): vector loop +; IF-EVL-EMPTY: +; IF-EVL-NEXT: vector loop: { +; IF-EVL-NEXT: vector.body: +; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH +; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]> +; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]> +; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]> +; IF-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + vp<[[IV]]> vp<[[EVL]]> +; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]> vp<[[VTC]]> +; IF-EVL-NEXT: No successors +; IF-EVL-NEXT: } + +; FORCE-EVL: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; FORCE-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; FORCE-EVL-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count +; FORCE-EVL-EMPTY: +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: Successor(s): vector loop +; FORCE-EVL-EMPTY: +; FORCE-EVL-NEXT: vector loop: { +; FORCE-EVL-NEXT: vector.body: +; FORCE-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; FORCE-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; FORCE-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH +; FORCE-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]> +; FORCE-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]> +; FORCE-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; FORCE-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]> +; 
FORCE-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + vp<[[IV]]> vp<[[EVL]]> +; FORCE-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]> vp<[[VTC]]> +; FORCE-EVL-NEXT: No successors +; FORCE-EVL-NEXT: } + +; NO-VP: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; NO-VP-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count +; NO-VP-EMPTY: +; NO-VP: vector.ph: +; NO-VP-NEXT: Successor(s): vector loop +; NO-VP-EMPTY: +; NO-VP-NEXT: vector loop: { +; NO-VP-NEXT: vector.body: +; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; NO-VP-NEXT: EMIT vp<[[MASK:%.+]]> = active lane mask vp<[[ST]]> vp<[[TC]]> +; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]> +; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]> +; NO-VP-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]> +; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + vp<[[IV]]> +; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]> vp<[[VTC]]> +; NO-VP-NEXT: No successors +; NO-VP-NEXT: } + +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %add, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-vp-intrinsics.ll @@ -0,0 +1,215 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=IF-EVL %s + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=FORCE-EVL %s + +; RUN: opt -passes=loop-vectorize \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +define void @foo(ptr noalias %a, ptr noalias 
%b, ptr noalias %c, i32 %N) { +; IF-EVL-LABEL: @foo( +; IF-EVL-NEXT: entry: +; IF-EVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; IF-EVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; IF-EVL: for.body.preheader: +; IF-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-EVL: vector.ph: +; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 15 +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-EVL: vector.body: +; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer +; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT2]], +; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; IF-EVL-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; IF-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]]) +; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-EVL: middle.block: +; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL: scalar.ph: +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-EVL: for.cond.cleanup.loopexit: +; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP]] +; IF-EVL: for.cond.cleanup: +; IF-EVL-NEXT: ret void +; IF-EVL: for.body: +; IF-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; 
IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] +; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; IF-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +; FORCE-EVL-LABEL: @foo( +; FORCE-EVL-NEXT: entry: +; FORCE-EVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; FORCE-EVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; FORCE-EVL: for.body.preheader: +; FORCE-EVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; FORCE-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 15 +; FORCE-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; FORCE-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; FORCE-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; FORCE-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; FORCE-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; FORCE-EVL-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE-EVL: vector.body: +; FORCE-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-EVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; FORCE-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0 +; FORCE-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer +; FORCE-EVL-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT2]], +; FORCE-EVL-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; FORCE-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; FORCE-EVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; FORCE-EVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; FORCE-EVL-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; FORCE-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; FORCE-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; FORCE-EVL-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]]) +; FORCE-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; FORCE-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCE-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP0:![0-9]+]] +; FORCE-EVL: middle.block: +; FORCE-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; FORCE-EVL: scalar.ph: +; FORCE-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; FORCE-EVL-NEXT: br label [[FOR_BODY:%.*]] +; FORCE-EVL: for.cond.cleanup.loopexit: +; FORCE-EVL-NEXT: br label [[FOR_COND_CLEANUP]] +; FORCE-EVL: for.cond.cleanup: +; FORCE-EVL-NEXT: ret void +; FORCE-EVL: for.body: +; FORCE-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; FORCE-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; FORCE-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; FORCE-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] +; FORCE-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; FORCE-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; FORCE-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; FORCE-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; FORCE-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +; NO-VP-LABEL: @foo( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; NO-VP-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; NO-VP: for.body.preheader: +; NO-VP-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 15 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0 +; NO-VP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer +; NO-VP-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT2]], +; NO-VP-NEXT: [[TMP1:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP3]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; NO-VP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 +; NO-VP-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x i32> 
@llvm.masked.load.v16i32.p0(ptr [[TMP5]], i32 4, <16 x i1> [[TMP1]], <16 x i32> poison) +; NO-VP-NEXT: [[TMP6:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; NO-VP-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP6]], ptr [[TMP8]], i32 4, <16 x i1> [[TMP1]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; NO-VP-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.cond.cleanup.loopexit: +; NO-VP-NEXT: br label [[FOR_COND_CLEANUP]] +; NO-VP: for.cond.cleanup: +; NO-VP-NEXT: ret void +; NO-VP: for.body: +; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] +; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 +; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %add, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll @@ -0,0 +1,123 @@ +; REQUIRES: asserts + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 
-mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=IF-EVL %s + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-explicit-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=FORCE-EVL %s + +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mtriple=x86_64 -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i32 %N) { +; IF-EVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; IF-EVL-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count +; IF-EVL-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count +; IF-EVL-EMPTY: +; IF-EVL: vector.ph: +; IF-EVL-NEXT: Successor(s): vector loop +; IF-EVL-EMPTY: +; IF-EVL-NEXT: vector loop: { +; IF-EVL-NEXT: vector.body: +; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; IF-EVL-NEXT: EMIT vp<[[VIV:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[IV]]> +; IF-EVL-NEXT: EMIT vp<[[MASK:%[0-9]+]]> = icmp ule vp<[[VIV]]> vp<[[BETC]]> +; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]>, vp<[[MASK]]> +; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]>, vp<[[MASK]]> +; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; IF-EVL-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]>, vp<[[MASK]]> +; IF-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + vp<[[IV]]> +; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]> vp<[[VTC]]> +; IF-EVL-NEXT: No successors +; IF-EVL-NEXT: } + +; FORCE-EVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; FORCE-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; FORCE-EVL-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count +; FORCE-EVL-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count +; FORCE-EVL-EMPTY: +; FORCE-EVL: vector.ph: +; FORCE-EVL-NEXT: Successor(s): vector loop +; FORCE-EVL-EMPTY: +; FORCE-EVL-NEXT: vector loop: { +; FORCE-EVL-NEXT: vector.body: +; FORCE-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; FORCE-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; FORCE-EVL-NEXT: EMIT vp<[[VIV:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[IV]]> +; FORCE-EVL-NEXT: EMIT vp<[[MASK:%[0-9]+]]> = icmp ule vp<[[VIV]]> vp<[[BETC]]> +; FORCE-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]>, vp<[[MASK]]> +; FORCE-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]>, vp<[[MASK]]> +; FORCE-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; FORCE-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; FORCE-EVL-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]>, vp<[[MASK]]> +; FORCE-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = 
VF * UF + vp<[[IV]]> +; FORCE-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]> vp<[[VTC]]> +; FORCE-EVL-NEXT: No successors +; FORCE-EVL-NEXT: } + +; NO-VP: VPlan 'Initial VPlan for VF={4},UF>=1' { +; NO-VP-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count +; NO-VP-NEXT: Live-in vp<[[BETC:%[0-9]+]]> = backedge-taken count +; NO-VP-NEXT: vp<[[TC:%[0-9]+]]> = original trip-count +; NO-VP-EMPTY: +; NO-VP: vector.ph: +; NO-VP-NEXT: Successor(s): vector loop +; NO-VP-EMPTY: +; NO-VP-NEXT: vector loop: { +; NO-VP-NEXT: vector.body: +; NO-VP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION +; NO-VP-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[IV]]>, ir<1> +; NO-VP-NEXT: EMIT vp<[[VIV:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[IV]]> +; NO-VP-NEXT: EMIT vp<[[MASK:%[0-9]+]]> = icmp ule vp<[[VIV]]> vp<[[BETC]]> +; NO-VP-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN ir<[[LD1:%.+]]> = load ir<[[GEP1]]>, vp<[[MASK]]> +; NO-VP-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN ir<[[LD2:%.+]]> = load ir<[[GEP2]]>, vp<[[MASK]]> +; NO-VP-NEXT: WIDEN ir<[[ADD:%.+]]> = add nsw ir<[[LD2]]>, ir<[[LD1]]> +; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> +; NO-VP-NEXT: WIDEN store ir<[[GEP3]]>, ir<[[ADD]]>, vp<[[MASK]]> +; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = VF * UF + vp<[[IV]]> +; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]> vp<[[VTC]]> +; NO-VP-NEXT: No successors +; NO-VP-NEXT: } + +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %add, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +}
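; A rough, illustrative sketch (not emitted by this patch): assuming the
; EXPLICIT-VECTOR-LENGTH plans checked above are eventually lowered to VP
; intrinsics, a single vector iteration of @foo on RISC-V could look roughly
; like the body below. The umin-based %evl computation, the fixed
; <vscale x 4 x i32> shape, and all value names here are assumptions for
; illustration only; llvm.vp.load/add/store and llvm.umin are the intrinsics
; documented in the LangRef ("Vector Predication Intrinsics").
define void @evl_body_sketch(ptr %a.gep, ptr %b.gep, ptr %c.gep,
                             i64 %remaining, i64 %vf) {
  ; Clamp the number of active lanes to what is left of the trip count
  ; (one possible way to materialize the EVL; hypothetical, not the patch's).
  %evl64 = call i64 @llvm.umin.i64(i64 %remaining, i64 %vf)
  %evl = trunc i64 %evl64 to i32
  ; All-true mask; predication is carried entirely by %evl.
  %m.ins = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
  %mask = shufflevector <vscale x 4 x i1> %m.ins, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
  ; Widened loads, add, and store of the a[i] = b[i] + c[i] body, with the
  ; active vector length passed as the trailing %evl operand.
  %vb = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %b.gep, <vscale x 4 x i1> %mask, i32 %evl)
  %vc = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %c.gep, <vscale x 4 x i1> %mask, i32 %evl)
  %add = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %vc, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %mask, i32 %evl)
  call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %add, ptr %a.gep, <vscale x 4 x i1> %mask, i32 %evl)
  ret void
}

declare i64 @llvm.umin.i64(i64, i64)
declare <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr, <vscale x 4 x i1>, i32)
declare <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, <vscale x 4 x i1>, i32)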