diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1364,6 +1364,12 @@ /// Intrinsics") Use of %evl is discouraged when that is not the case. bool hasActiveVectorLength() const; + /// If the target uses custom instruction to compute + /// active vector length, use an intrinsic in the IR that will be lowered to + /// this instruction. Else, the IR will use instructions for computing Min(VF, + /// TripCount - Induction). + bool useCustomActiveVectorLengthIntrinsic() const; + /// @} /// @} @@ -1663,6 +1669,7 @@ virtual unsigned getGISelRematGlobalCost() const = 0; virtual bool supportsScalableVectors() const = 0; virtual bool hasActiveVectorLength() const = 0; + virtual bool useCustomActiveVectorLengthIntrinsic() const = 0; virtual InstructionCost getInstructionLatency(const Instruction *I) = 0; }; @@ -2216,6 +2223,10 @@ return Impl.hasActiveVectorLength(); } + bool useCustomActiveVectorLengthIntrinsic() const override { + return Impl.useCustomActiveVectorLengthIntrinsic(); + } + InstructionCost getInstructionLatency(const Instruction *I) override { return Impl.getInstructionLatency(I); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -740,6 +740,8 @@ bool hasActiveVectorLength() const { return false; } + bool useCustomActiveVectorLengthIntrinsic() const { return false; } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -43,6 +43,7 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CBindingWrapping.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/TypeSize.h" #include #include #include @@ -2534,6 +2535,18 @@ unsigned Index, unsigned FieldIndex, MDNode *DbgInfo); + /// Return an all true boolean vector of size and scalability \p NumElts. + Value *CreateTrueVector(ElementCount NumElts) { + VectorType *VTy = VectorType::get(Type::getInt1Ty(Context), NumElts); + return Constant::getAllOnesValue(VTy); + } + + /// Return an all false boolean vector of size and scalability \p NumElts. 
+ Value *CreateFalseVector(ElementCount NumElts) { + VectorType *VTy = VectorType::get(Type::getInt1Ty(Context), NumElts); + return ConstantAggregateZero::get(VTy); + } + private: /// Helper function that creates an assume intrinsic call that /// represents an alignment assumption on the provided pointer \p PtrValue diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1342,6 +1342,26 @@ //===---------------- Vector Predication Intrinsics --------------===// +// Memory Intrinsics +def int_vp_store : DefaultAttrsIntrinsic<[], + [ llvm_anyvector_ty, + LLVMAnyPointerType>, + llvm_i32_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], + [ NoCapture>, IntrNoSync, IntrWriteMem, + IntrArgMemOnly, IntrWillReturn, + ImmArg> ]>; + +def int_vp_load : DefaultAttrsIntrinsic<[ llvm_anyvector_ty], + [ LLVMAnyPointerType>, + llvm_i32_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], + [ NoCapture>, IntrNoSync, IntrReadMem, + IntrWillReturn, IntrArgMemOnly, + ImmArg> ]>; + // Speculatable Binary operators let IntrProperties = [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in { def int_vp_add : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], @@ -1420,6 +1440,11 @@ [llvm_anyint_ty, LLVMMatchType<1>], [IntrNoMem, IntrNoSync, IntrWillReturn]>; +def int_experimental_set_vector_length: + DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrNoSync, IntrWillReturn]>; + //===-------------------------- Masked Intrinsics -------------------------===// // def int_masked_load: diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1033,6 +1033,14 @@ return TTIImpl->supportsScalableVectors(); } +bool TargetTransformInfo::hasActiveVectorLength() const { + return TTIImpl->hasActiveVectorLength(); +} + +bool TargetTransformInfo::useCustomActiveVectorLengthIntrinsic() const { + return TTIImpl->useCustomActiveVectorLengthIntrinsic(); +} + InstructionCost TargetTransformInfo::getInstructionLatency(const Instruction *I) const { return TTIImpl->getInstructionLatency(I); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -60,6 +60,7 @@ #include "VPlanHCFGBuilder.h" #include "VPlanPredicator.h" #include "VPlanTransforms.h" +#include "VPlanValue.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -233,6 +234,54 @@ "prefers tail-folding, don't attempt vectorization if " "tail-folding fails."))); +// Option prefer-predicate-with-vp-intrinsics is an experimental switch to +// indicate that the loop vectorizer should try to generate VP intrinsics if +// tail-folding is enabled (note that this option is dependent on the +// prefer-predicate-over-epilogue option being set to predicate-dont-vectorize). +// This can be particularly useful for targets like RISC-V and SX-Aurora that +// support vector length predication. +// Currently this switch takes four possible values: +// 0. no-predication: Do not generate VP intrinsics. +// 1. if-active-vector-length-supported: Only generate VP intrinsics if the +// target supports active vector length based predication. +// 2. 
without-avl-support: Generate VP intrinsics even if vector length based
+// predication is not supported. This will behave a bit like existing
+// tail-folding by using a mask for predication, except all instructions are
+// widened to VP intrinsics and not just memory instructions. Use of this
+// option is discouraged and is only meant for experimental/testing purposes.
+// 3. force-active-vector-length-support: This is purely an experimental/testing
+// option which will be removed in the future. It forces the loop vectorizer to
+// assume that the target supports vector length predication.
+namespace PreferVPIntrinsicsTy {
+enum Option {
+  NoPredication = 0,
+  IfAVLSupported,
+  WithoutAVLSupport,
+  ForceAVLSupport
+};
+} // namespace PreferVPIntrinsicsTy
+
+static cl::opt<PreferVPIntrinsicsTy::Option> PreferPredicateWithVPIntrinsics(
+    "prefer-predicate-with-vp-intrinsics",
+    cl::init(PreferVPIntrinsicsTy::NoPredication), cl::Hidden,
+    cl::desc("When vectorizing with tail-folding, generate vector predication "
+             "intrinsics."),
+    cl::values(
+        clEnumValN(PreferVPIntrinsicsTy::NoPredication, "no-predication",
+                   "Do not generate VP intrinsics."),
+        clEnumValN(PreferVPIntrinsicsTy::IfAVLSupported,
+                   "if-active-vector-length-support",
+                   "Only generate VP intrinsics if the target supports vector "
+                   "length predication."),
+        clEnumValN(PreferVPIntrinsicsTy::WithoutAVLSupport,
+                   "without-active-vector-length-support",
+                   "Generate VP intrinsics even if vector length predication "
+                   "is not supported. This option is discouraged."),
+        clEnumValN(PreferVPIntrinsicsTy::ForceAVLSupport,
+                   "force-active-vector-length-support",
+                   "Assume that the target supports vector length predication "
+                   "and generate VP intrinsics accordingly.")));
+
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -480,6 +529,11 @@
   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                         VPTransformState &State);
 
+  /// Widen a single instruction to a VP intrinsic within the innermost loop.
+  void widenPredicatedInstruction(Instruction &I, VPValue *Def,
+                                  VPUser &Operands, VPTransformState &State,
+                                  VPValue *BlockInMask, VPValue *EVL);
+
   /// Widen a single call instruction within the innermost loop.
   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                             VPTransformState &State);
@@ -548,7 +602,8 @@
   /// vectorized loop.
   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                   VPValue *Def, VPValue *Addr,
-                                  VPValue *StoredValue, VPValue *BlockInMask);
+                                  VPValue *StoredValue, VPValue *BlockInMask,
+                                  VPValue *EVL = nullptr);
 
   /// Set the debug location in the builder using the debug location in
   /// the instruction.
@@ -564,6 +619,10 @@
   /// element.
   virtual Value *getBroadcastInstrs(Value *V);
 
+  /// Create instructions to compute the Explicit Vector Length when using VP
+  /// intrinsics.
+  Value *createEVL();
+
 protected:
   friend class LoopVectorizationPlanner;
 
@@ -1568,6 +1627,11 @@
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
+  /// Returns true if VP intrinsics should be generated in the tail-folded loop.
+  bool preferVPIntrinsics() const {
+    return foldTailByMasking() && PreferVPIntrinsics;
+  }
+
   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
   /// nodes to the chain of instructions representing the reductions. Uses a
   /// MapVector to ensure deterministic iteration order.
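Note: with tail folding and prefer-predicate-with-vp-intrinsics enabled, the recipes introduced below widen loads, stores and the supported binary operators into llvm.vp.* calls that carry both the block-in mask and the per-part Explicit Vector Length (EVL). As a rough sketch for VF=4, the vector body of a simple a[i] = b[i] + c[i] loop becomes the following (value names are illustrative; the exact sequences are checked in the new tests at the end of this patch):

  %vp.op.load  = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* %b.ptr, i32 4, <4 x i1> %mask, i32 %evl)
  %vp.op.load3 = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* %c.ptr, i32 4, <4 x i1> %mask, i32 %evl)
  %vp.op       = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %vp.op.load3, <4 x i32> %vp.op.load, <4 x i1> %mask, i32 %evl)
  call void @llvm.vp.store.v4i32.p0v4i32(<4 x i32> %vp.op, <4 x i32>* %a.ptr, i32 4, <4 x i1> %mask, i32 %evl)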
@@ -1704,6 +1768,9 @@ /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; + /// Control whether to generate VP intrinsics in vectorized code. + bool PreferVPIntrinsics = false; + /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated @@ -2834,7 +2901,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask) { + VPValue *StoredValue, VPValue *BlockInMask, VPValue *EVL) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); StoreInst *SI = dyn_cast(Instr); @@ -2863,6 +2930,13 @@ bool CreateGatherScatter = (Decision == LoopVectorizationCostModel::CM_GatherScatter); + if (Reverse) + assert(!EVL && + "Vector reverse not supported for predicated vectorization."); + if (CreateGatherScatter) + assert(!EVL && "Gather/Scatter operations not supported for " + "predicated vectorization."); + // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector // gather/scatter. Otherwise Decision should have been to Scalarize. assert((ConsecutiveStride || CreateGatherScatter) && @@ -2918,6 +2992,13 @@ for (unsigned Part = 0; Part < UF; ++Part) { Instruction *NewSI = nullptr; Value *StoredVal = State.get(StoredValue, Part); + + // If EVL is not nullptr, then EVL must be a valid value set during plan + // creation, possibly default value = whole vector register length. EVL is + // created only if TTI prefers predicated vectorization, thus if EVL is + // not nullptr it also implies preference for predicated vectorization. + Value *EVLPart = EVL ? State.get(EVL, Part) : nullptr; + if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(Addr, Part); @@ -2932,11 +3013,25 @@ // another expression. So don't call resetVectorValue(StoredVal). } auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) + // if EVLPart is not null, we can vectorize using predicated + // intrinsic. + if (EVLPart) { + assert(isMaskRequired && + "Mask argument is required for VP intrinsics."); + VectorType *StoredValTy = cast(StoredVal->getType()); + Value *BlockInMaskPart = BlockInMaskParts[Part]; + Value *EVLPartI32 = Builder.CreateSExtOrTrunc( + EVLPart, Type::getInt32Ty(Builder.getContext())); + NewSI = Builder.CreateIntrinsic( + Intrinsic::vp_store, {StoredValTy, VecPtr->getType()}, + {StoredVal, VecPtr, Builder.getInt32(Alignment.value()), + BlockInMaskPart, EVLPartI32}); + } else if (isMaskRequired) { NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, BlockInMaskParts[Part]); - else + } else { NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } } addMetadata(NewSI, SI); } @@ -2948,6 +3043,13 @@ setDebugLocFromInst(Builder, LI); for (unsigned Part = 0; Part < UF; ++Part) { Value *NewLI; + + // If EVL is not nullptr, then EVL must be a valid value set during plan + // creation, possibly default value = whole vector register length. EVL is + // created only if TTI prefers predicated vectorization, thus if EVL is + // not nullptr it also implies preference for predicated vectorization. + Value *EVLPart = EVL ? State.get(EVL, Part) : nullptr; + if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(Addr, Part); @@ -2956,13 +3058,26 @@ addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) + if (EVLPart) { + assert(isMaskRequired && + "Mask argument is required for VP intrinsics."); + Value *BlockInMaskPart = BlockInMaskParts[Part]; + Value *EVLPartI32 = Builder.CreateSExtOrTrunc( + EVLPart, Type::getInt32Ty(Builder.getContext())); + NewLI = Builder.CreateIntrinsic( + Intrinsic::vp_load, + {VecPtr->getType()->getPointerElementType(), VecPtr->getType()}, + {VecPtr, Builder.getInt32(Alignment.value()), BlockInMaskPart, + EVLPartI32}, + nullptr, "vp.op.load"); + } else if (isMaskRequired) { NewLI = Builder.CreateMaskedLoad( VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), "wide.masked.load"); - else + } else { NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + } // Add metadata to the load, but setVectorValue to the reverse shuffle. addMetadata(NewLI, LI); @@ -4840,6 +4955,73 @@ return !CInt || CInt->isZero(); } +void InnerLoopVectorizer::widenPredicatedInstruction(Instruction &I, + VPValue *Def, VPUser &User, + VPTransformState &State, + VPValue *BlockInMask, + VPValue *EVL) { + auto getVPIntrInstr = [](unsigned Opcode) { + switch (Opcode) { + case Instruction::Add: + return Intrinsic::vp_add; + case Instruction::Sub: + return Intrinsic::vp_sub; + case Instruction::Mul: + return Intrinsic::vp_mul; + case Instruction::SDiv: + return Intrinsic::vp_sdiv; + case Instruction::UDiv: + return Intrinsic::vp_udiv; + case Instruction::SRem: + return Intrinsic::vp_srem; + case Instruction::URem: + return Intrinsic::vp_urem; + case Instruction::AShr: + return Intrinsic::vp_ashr; + case Instruction::LShr: + return Intrinsic::vp_lshr; + case Instruction::Shl: + return Intrinsic::vp_shl; + case Instruction::Or: + return Intrinsic::vp_or; + case Instruction::And: + return Intrinsic::vp_and; + case Instruction::Xor: + return Intrinsic::vp_xor; + } + return Intrinsic::not_intrinsic; + }; + + unsigned Opcode = I.getOpcode(); + assert(getVPIntrInstr(Opcode) != Intrinsic::not_intrinsic && + "Instruction does not have VP intrinsic support."); + + // Just widen unops and binops. + setDebugLocFromInst(Builder, &I); + + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector Ops; + for (unsigned OpIdx = 0; OpIdx < User.getNumOperands() - 2; OpIdx++) + Ops.push_back(State.get(User.getOperand(OpIdx), Part)); + + VectorType *OpTy = cast(Ops[0]->getType()); + Value *MaskOp = State.get(BlockInMask, Part); + Ops.push_back(MaskOp); + + Value *EVLOp = State.get(EVL, Part); + Ops.push_back(EVLOp); + + Value *V = Builder.CreateIntrinsic(getVPIntrInstr(Opcode), OpTy, Ops, + nullptr, "vp.op"); + if (auto *VecOp = dyn_cast(V)) + VecOp->copyIRFlags(&I); + + // Use this vector value for all users of the original instruction. + State.set(Def, V, Part); + addMetadata(V, &I); + } +} + void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, VPUser &User, VPTransformState &State) { @@ -5655,6 +5837,28 @@ // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; + if (!PreferPredicateWithVPIntrinsics) + return MaxVF; + + if (UserIC > 1) { + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. 
Will " + "not generate VP intrinsics since interleave count " + "specified is greater than 1.\n"); + return MaxVF; + } + + if (PreferPredicateWithVPIntrinsics == + PreferVPIntrinsicsTy::IfAVLSupported) { + PreferVPIntrinsics = TTI.hasActiveVectorLength(); + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will " + "try to generate VP Intrinsics if the target " + "support vector length predication.\n"); + } else { + PreferVPIntrinsics = true; + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will " + "try to generate VP Intrinsics.\n"); + } + return MaxVF; } @@ -6132,6 +6336,11 @@ if (!isScalarEpilogueAllowed()) return 1; + // Do not interleave if VP intrinsics are preferred and no User IC is + // specified. + if (preferVPIntrinsics()) + return 1; + // We used the distance for the interleave count. if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; @@ -8350,6 +8559,19 @@ if (!CM.blockNeedsPredication(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. + // if header block needs predication then it is only because tail-folding is + // enabled. If we are using VP intrinsics for a target with vector length + // predication support, this mask (icmp ule %IV %BTC) becomes redundant with + // EVL, which means unless we are using VP intrinsics without vector length + // predication support we can replace this mask with an all-true mask for + // possibly better latency. + if (CM.preferVPIntrinsics() && + PreferPredicateWithVPIntrinsics != + PreferVPIntrinsicsTy::WithoutAVLSupport) { + BlockMask = Builder.createNaryOp(VPInstruction::AllTrueMask, {}); + return BlockMaskCache[BB] = BlockMask; + } + // Create the block in mask as the first non-phi instruction in the block. VPBuilder::InsertPointGuard Guard(Builder); auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); @@ -8398,8 +8620,17 @@ return BlockMaskCache[BB] = BlockMask; } -VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan) { +VPValue *VPRecipeBuilder::getOrCreateEVL(VPlanPtr &Plan) { + if (!EVL) { + auto *EVLRecipe = new VPWidenEVLRecipe(); + Builder.getInsertBlock()->appendRecipe(EVLRecipe); + EVL = EVLRecipe->getEVL(); + } + return EVL; +} + +bool VPRecipeBuilder::validateWidenMemory(Instruction *I, + VFRange &Range) const { assert((isa(I) || isa(I)) && "Must be called with either a load or store"); @@ -8418,7 +8649,12 @@ return Decision != LoopVectorizationCostModel::CM_Scalarize; }; - if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) + return (LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)); +} + +VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan) { + if (!validateWidenMemory(I, Range)) return nullptr; VPValue *Mask = nullptr; @@ -8434,6 +8670,24 @@ return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); } +VPRecipeBase *VPRecipeBuilder::tryToPredicatedWidenMemory(Instruction *I, + VFRange &Range, + VPlanPtr &Plan) { + if (!validateWidenMemory(I, Range)) + return nullptr; + + VPValue *Mask = createBlockInMask(I->getParent(), Plan); + VPValue *EVL = getOrCreateEVL(Plan); + VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); + if (LoadInst *Load = dyn_cast(I)) + return new VPPredicatedWidenMemoryInstructionRecipe(*Load, Addr, Mask, EVL); + + StoreInst *Store = cast(I); + VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); + return new 
VPPredicatedWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, + Mask, EVL); +} + VPWidenIntOrFpInductionRecipe * VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { // Check if this is an integer or fp induction. If so, build the recipe that @@ -8563,7 +8817,11 @@ Range); } -VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { +bool VPRecipeBuilder::preferPredicatedWiden() const { + return CM.preferVPIntrinsics(); +} + +bool VPRecipeBuilder::validateWiden(Instruction *I) const { auto IsVectorizableOpcode = [](unsigned Opcode) { switch (Opcode) { case Instruction::Add: @@ -8605,13 +8863,28 @@ return false; }; - if (!IsVectorizableOpcode(I->getOpcode())) + return IsVectorizableOpcode(I->getOpcode()); +} + +VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { + if (!validateWiden(I)) return nullptr; // Success: widen this instruction. return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); } +VPPredicatedWidenRecipe *VPRecipeBuilder::tryToPredicatedWiden(Instruction *I, + VPlanPtr &Plan) { + if (!validateWiden(I)) + return nullptr; + + VPValue *Mask = createBlockInMask(I->getParent(), Plan); + VPValue *EVL = getOrCreateEVL(Plan); + return new VPPredicatedWidenRecipe(*I, Plan->mapToVPValues(I->operands()), + Mask, EVL); +} + VPBasicBlock *VPRecipeBuilder::handleReplication( Instruction *I, VFRange &Range, VPBasicBlock *VPBB, VPlanPtr &Plan) { @@ -8700,8 +8973,12 @@ if (auto *CI = dyn_cast(Instr)) return toVPRecipeResult(tryToWidenCall(CI, Range, *Plan)); - if (isa(Instr) || isa(Instr)) + if (isa(Instr) || isa(Instr)) { + if (preferPredicatedWiden()) { + return toVPRecipeResult(tryToPredicatedWidenMemory(Instr, Range, Plan)); + } return toVPRecipeResult(tryToWidenMemory(Instr, Range, Plan)); + } VPRecipeBase *Recipe; if (auto Phi = dyn_cast(Instr)) { @@ -8738,6 +9015,9 @@ *SI, Plan->mapToVPValues(SI->operands()), InvariantCond)); } + if (preferPredicatedWiden()) { + return toVPRecipeResult(tryToPredicatedWiden(Instr, Plan)); + } return toVPRecipeResult(tryToWiden(Instr, *Plan)); } @@ -9115,6 +9395,11 @@ State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); } +void VPPredicatedWidenRecipe::execute(VPTransformState &State) { + State.ILV->widenPredicatedInstruction(*getUnderlyingInstr(), this, *this, + State, getMask(), getEVL()); +} + void VPWidenGEPRecipe::execute(VPTransformState &State) { State.ILV->widenGEP(cast(getUnderlyingInstr()), this, *this, State.UF, State.VF, IsPtrLoopInvariant, @@ -9321,6 +9606,63 @@ getAddr(), StoredValue, getMask()); } +void VPPredicatedWidenMemoryInstructionRecipe::execute( + VPTransformState &State) { + VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; + State.ILV->vectorizeMemoryInstruction( + &Ingredient, State, StoredValue ? nullptr : getVPValue(), getAddr(), + StoredValue, getMask(), getEVL()); +} + +Value *InnerLoopVectorizer::createEVL() { + assert(PreferPredicateWithVPIntrinsics != + PreferVPIntrinsicsTy::NoPredication && + "Predication with VP intrinsics turned off."); + + if (PreferPredicateWithVPIntrinsics == PreferVPIntrinsicsTy::IfAVLSupported) + assert(TTI->hasActiveVectorLength() && + "Target does not support vector length predication."); + + auto *MinVF = Builder.getInt32(VF.getKnownMinValue()); + Value *RuntimeVL = + VF.isScalable() ? 
Builder.CreateVScale(MinVF, "vscale.x.vf") : MinVF; + + if (PreferPredicateWithVPIntrinsics == + PreferVPIntrinsicsTy::WithoutAVLSupport && + !TTI->hasActiveVectorLength()) { + return RuntimeVL; + } + + Value *Remaining = Builder.CreateSub(TripCount, Induction); + // FIXME: This is a proof-of-concept naive implementation to demonstrate using + // a target dependent intrinisc to compute the vector length. + if (TTI->useCustomActiveVectorLengthIntrinsic()) { + // Set Element width to the widest type used in the loop. + unsigned SmallestType, WidestType; + std::tie(SmallestType, WidestType) = Cost->getSmallestAndWidestTypes(); + Constant *ElementWidth = Builder.getInt32(WidestType); + // Set Register width factor to 1. + Constant *RegWidthFactor = Builder.getInt32(1); + return Builder.CreateIntrinsic(Intrinsic::experimental_set_vector_length, + {Remaining->getType()}, + {Remaining, ElementWidth, RegWidthFactor}); + } + + Value *RuntimeVLExt = Builder.CreateZExt(RuntimeVL, Remaining->getType()); + Value *EVL = + Builder.CreateBinaryIntrinsic(Intrinsic::umin, RuntimeVLExt, Remaining); + return Builder.CreateTrunc(EVL, Builder.getInt32Ty()); +} + +void VPWidenEVLRecipe::execute(VPTransformState &State) { + // FIXME: Interleaving with predicated vectorization is not yet supported. + // Since VPlan only provides set methods for per Part or per Instance, we use + // the per Part set method to store the same EVL for each Part (State.UF would + // be 1 for now.) + for (unsigned Part = 0; Part < State.UF; Part++) + State.set(getEVL(), State.ILV->createEVL(), Part); +} + // Determine how to lower the scalar epilogue, which depends on 1) optimising // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing // predication, and 4) a TTI hook that analyses whether the loop is suitable diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -41,6 +41,8 @@ VPBuilder &Builder; + VPValue *EVL = nullptr; + /// When we if-convert we need to create edge masks. We have to cache values /// so that we don't end up with exponential recursion/IR. Note that /// if-conversion currently takes place during VPlan-construction, so these @@ -67,6 +69,15 @@ VPRecipeBase *tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan); + /// Similar to tryToWidenMemory, but create a predicated recipe. The + /// predicated recipe takes mandatory mask and EVL VPInstructions. + VPRecipeBase *tryToPredicatedWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan); + + /// Helper method used by tryToWidenMemory and tryToPredicatedWidenMemory to + /// validate if a memory instructions can be widened. + bool validateWidenMemory(Instruction *I, VFRange &Range) const; + /// Check if an induction recipe should be constructed for \I. If so build and /// return it. If not, return null. VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi, @@ -95,9 +106,19 @@ /// that widening should be performed. VPWidenRecipe *tryToWiden(Instruction *I, VPlan &Plan) const; + /// Similar to tryToWiden, but widen to VP intrinsics. + VPPredicatedWidenRecipe *tryToPredicatedWiden(Instruction *I, VPlanPtr &Plan); + + /// Helper method used by tryToWiden and tryToPredicatedWiden to validate if + /// an instruction can be widened. + bool validateWiden(Instruction *I) const; + /// Return a VPRecipeOrValueTy with VPRecipeBase * being set. 
This can be used to force the use as VPRecipeBase* for recipe sub-types that also inherit from VPValue. VPRecipeOrVPValueTy toVPRecipeResult(VPRecipeBase *R) const { return R; } + /// Create recipes that will expand to VP intrinsics. + bool preferPredicatedWiden() const; + public: VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI, LoopVectorizationLegality *Legal, @@ -132,6 +153,10 @@ /// and DST. VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan); + /// A helper function that computes the Explicit(Active) Vector Length for the + /// current vector iteration. + VPValue *getOrCreateEVL(VPlanPtr &Plan); + /// Mark given ingredient for recording its recipe once one is created for /// it. void recordRecipeOf(Instruction *I) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -685,6 +685,7 @@ inline bool VPUser::classof(const VPDef *Def) { return Def->getVPDefID() == VPRecipeBase::VPInstructionSC || + Def->getVPDefID() == VPRecipeBase::VPPredicatedWidenSC || Def->getVPDefID() == VPRecipeBase::VPWidenSC || Def->getVPDefID() == VPRecipeBase::VPWidenCallSC || Def->getVPDefID() == VPRecipeBase::VPWidenSelectSC || @@ -694,6 +695,8 @@ Def->getVPDefID() == VPRecipeBase::VPReplicateSC || Def->getVPDefID() == VPRecipeBase::VPReductionSC || Def->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC || + Def->getVPDefID() == + VPRecipeBase::VPPredicatedWidenMemoryInstructionSC || Def->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC; } @@ -712,6 +715,7 @@ SLPLoad, SLPStore, ActiveLaneMask, + AllTrueMask, }; private: @@ -830,6 +834,45 @@ #endif }; +/// VPPredicatedWidenRecipe is a recipe for producing a copy of vector type +/// using VP intrinsics for its ingredient. This recipe covers most of the +/// traditional vectorization cases where each ingredient transforms into a +/// vectorized version of itself. +class VPPredicatedWidenRecipe : public VPRecipeBase, public VPValue { +public: + template + VPPredicatedWidenRecipe(Instruction &I, iterator_range Operands, + VPValue *Mask, VPValue *EVL) + : VPRecipeBase(VPRecipeBase::VPPredicatedWidenSC, Operands), + VPValue(VPValue::VPVPredicatedWidenSC, &I, this) { + addOperand(Mask); + addOperand(EVL); + } + + ~VPPredicatedWidenRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPPredicatedWidenSC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVPredicatedWidenSC; + } + + /// Return the mask used by this recipe. + VPValue *getMask() const { return getOperand(getNumOperands() - 2); } + + /// Return the explicit vector length used by this recipe. + VPValue *getEVL() const { return getOperand(getNumOperands() - 1); } + + /// Generate the wide load/store. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +}; + /// A recipe for widening Call instructions. class VPWidenCallRecipe : public VPRecipeBase, public VPValue { @@ -1300,6 +1343,33 @@ } }; +/// A recipe to generate Explicit Vector Length (EVL) value to be used with +/// VPred intrinsics. 
+class VPWidenEVLRecipe : public VPRecipeBase, public VPValue { + +public: + VPWidenEVLRecipe() + : VPRecipeBase(VPRecipeBase::VPWidenEVLSC, {}), + VPValue(VPValue::VPVWidenEVLSC, nullptr, this) {} + ~VPWidenEVLRecipe() override = default; + + /// Return the VPValue representing EVL. + const VPValue *getEVL() const { return this; } + VPValue *getEVL() { return this; } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenEVLSC; + } + + /// Generate the instructions to compute EVL. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +}; + /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when /// control converges back from a Branch-on-Mask. The phi nodes are needed in /// order to merge values that are set under such a branch and feed their uses. @@ -1398,6 +1468,68 @@ #endif }; +/// A Recipe for widening load/store operations to VP intrinsics. +/// The recipe uses the following VPValues: +/// - For load: Address, mask, EVL +/// - For store: Address, stored value, mask, EVL +class VPPredicatedWidenMemoryInstructionRecipe : public VPRecipeBase { + Instruction &Ingredient; + +public: + VPPredicatedWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, + VPValue *Mask, VPValue *EVL) + : VPRecipeBase(VPPredicatedWidenMemoryInstructionSC, {Addr, Mask, EVL}), + Ingredient(Load) { + new VPValue(VPValue::VPVPredicatedMemoryInstructionSC, &Load, this); + } + + VPPredicatedWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, + VPValue *StoredValue, VPValue *Mask, + VPValue *EVL) + : VPRecipeBase(VPPredicatedWidenMemoryInstructionSC, + {Addr, StoredValue, Mask, EVL}), + Ingredient(Store) {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == + VPRecipeBase::VPPredicatedWidenMemoryInstructionSC; + } + + /// Return the address accessed by this recipe. + VPValue *getAddr() const { + return getOperand(0); // Address is the 1st, mandatory operand. + } + + /// Return the mask used by this recipe. + VPValue *getMask() const { + // Mask is the second last, mandatory operand. + return getOperand(getNumOperands() - 2); + } + + /// Return the EVL used by this recipe. + VPValue *getEVL() const { + // EVL is the last, mandatory operand. + return getOperand(getNumOperands() - 1); + } + + /// Returns true if this recipe is a store. + bool isStore() const { return isa(Ingredient); } + + /// Return the address accessed by this recipe. + VPValue *getStoredValue() const { + assert(isStore() && "Stored value only available for store instructions"); + return getOperand(1); // Stored value is the 2nd, mandatory operand. + } + + /// Generate the wide load/store. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +}; + /// A Recipe for widening the canonical induction variable of the vector loop. 
class VPWidenCanonicalIVRecipe : public VPRecipeBase { public: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -598,6 +598,12 @@ State.set(this, Call, Part); break; } + case VPInstruction::AllTrueMask: { + Value *AllTrueMask = Builder.CreateTrueVector(State.VF); + State.set(this, AllTrueMask, Part); + break; + } + default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -640,7 +646,9 @@ case VPInstruction::ActiveLaneMask: O << "active lane mask"; break; - + case VPInstruction::AllTrueMask: + O << "all true mask"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -980,6 +988,14 @@ printOperands(O, SlotTracker); } +void VPPredicatedWidenRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "PREDICATED-WIDEN "; + printAsOperand(O, SlotTracker); + O << " = " << getUnderlyingInstr()->getOpcodeName() << " "; + printOperands(O, SlotTracker); +} + void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-INDUCTION"; @@ -1084,6 +1100,19 @@ } #endif +void VPPredicatedWidenMemoryInstructionRecipe::print( + raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { + O << Indent << "PREDICATED-WIDEN "; + + if (!isStore()) { + getVPValue()->printAsOperand(O, SlotTracker); + O << " = "; + } + O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; + + printOperands(O, SlotTracker); +} + void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { Value *CanonicalIV = State.CanonicalIV; Type *STy = CanonicalIV->getType(); @@ -1118,6 +1147,13 @@ } #endif +void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + getEVL()->printAsOperand(O, SlotTracker); + O << " = GENERATE-EXPLICIT-VECTOR-LENGTH"; +} + template void DomTreeBuilder::Calculate(VPDominatorTree &DT); void VPValue::replaceAllUsesWith(VPValue *New) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -51,6 +51,7 @@ friend class VPSlotTracker; friend class VPRecipeBase; friend class VPWidenMemoryInstructionRecipe; + friend class VPPredicatedWidenMemoryInstructionRecipe; const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). 
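The VPWidenEVLRecipe added above stores one EVL value per part by calling InnerLoopVectorizer::createEVL(). When the target does not provide a custom active-vector-length intrinsic, that computes min(VF, TripCount - Induction); for a fixed VF of 4 this amounts to the following sketch (names illustrative, matching the force-active-vector-length-support test below):

  %remaining = sub i64 %wide.trip.count, %index
  %evl.umin  = call i64 @llvm.umin.i64(i64 4, i64 %remaining)
  %evl       = trunc i64 %evl.umin to i32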
@@ -94,10 +95,13 @@ VPVInstructionSC, VPVMemoryInstructionSC, VPVPredInstPHI, + VPVPredicatedMemoryInstructionSC, + VPVPredicatedWidenSC, VPVReductionSC, VPVReplicateSC, VPVWidenSC, VPVWidenCallSC, + VPVWidenEVLSC, VPVWidenGEPSC, VPVWidenIntOrFpIndcutionSC, VPVWidenPHISC, @@ -303,10 +307,13 @@ VPInstructionSC, VPInterleaveSC, VPPredInstPHISC, + VPPredicatedWidenMemoryInstructionSC, + VPPredicatedWidenSC, VPReductionSC, VPReplicateSC, VPWidenCallSC, VPWidenCanonicalIVSC, + VPWidenEVLSC, VPWidenGEPSC, VPWidenIntOrFpInductionSC, VPWidenMemoryInstructionSC, diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/vectorize-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/vectorize-vp-intrinsics.ll @@ -0,0 +1,292 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=without-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=WITHOUT-AVL %s + +; RUN: opt -loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=IF-AVL %s + +; RUN: opt -loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=FORCE-AVL %s + +; RUN: opt -loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nofree norecurse nounwind uwtable +define dso_local void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) local_unnamed_addr { +; WITHOUT-AVL-LABEL: @foo( +; WITHOUT-AVL-NEXT: entry: +; WITHOUT-AVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; WITHOUT-AVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; WITHOUT-AVL: for.body.preheader: +; WITHOUT-AVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; WITHOUT-AVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; WITHOUT-AVL: vector.ph: +; WITHOUT-AVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 3 +; WITHOUT-AVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; WITHOUT-AVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; WITHOUT-AVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; WITHOUT-AVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0 +; WITHOUT-AVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; WITHOUT-AVL-NEXT: br label [[VECTOR_BODY:%.*]] +; WITHOUT-AVL: vector.body: +; WITHOUT-AVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; WITHOUT-AVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] 
= insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 +; WITHOUT-AVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; WITHOUT-AVL-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; WITHOUT-AVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; WITHOUT-AVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]] +; WITHOUT-AVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; WITHOUT-AVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 +; WITHOUT-AVL-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; WITHOUT-AVL-NEXT: [[VP_OP_LOAD:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* [[TMP4]], i32 4, <4 x i1> [[TMP1]], i32 4) +; WITHOUT-AVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] +; WITHOUT-AVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; WITHOUT-AVL-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; WITHOUT-AVL-NEXT: [[VP_OP_LOAD3:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* [[TMP7]], i32 4, <4 x i1> [[TMP1]], i32 4) +; WITHOUT-AVL-NEXT: [[VP_OP:%.*]] = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> [[VP_OP_LOAD3]], <4 x i32> [[VP_OP_LOAD]], <4 x i1> [[TMP1]], i32 4) +; WITHOUT-AVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; WITHOUT-AVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 +; WITHOUT-AVL-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; WITHOUT-AVL-NEXT: call void @llvm.vp.store.v4i32.p0v4i32(<4 x i32> [[VP_OP]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[TMP1]], i32 4) +; WITHOUT-AVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; WITHOUT-AVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; WITHOUT-AVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; WITHOUT-AVL: middle.block: +; WITHOUT-AVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; WITHOUT-AVL: scalar.ph: +; WITHOUT-AVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; WITHOUT-AVL-NEXT: br label [[FOR_BODY:%.*]] +; WITHOUT-AVL: for.cond.cleanup.loopexit: +; WITHOUT-AVL-NEXT: br label [[FOR_COND_CLEANUP]] +; WITHOUT-AVL: for.cond.cleanup: +; WITHOUT-AVL-NEXT: ret void +; WITHOUT-AVL: for.body: +; WITHOUT-AVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; WITHOUT-AVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; WITHOUT-AVL-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; WITHOUT-AVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] +; WITHOUT-AVL-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; WITHOUT-AVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]] +; WITHOUT-AVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; WITHOUT-AVL-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; WITHOUT-AVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; WITHOUT-AVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; WITHOUT-AVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; +; IF-AVL-LABEL: @foo( +; IF-AVL-NEXT: 
entry: +; IF-AVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; IF-AVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; IF-AVL: for.body.preheader: +; IF-AVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; IF-AVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-AVL: vector.ph: +; IF-AVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 3 +; IF-AVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; IF-AVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-AVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; IF-AVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0 +; IF-AVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IF-AVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-AVL: vector.body: +; IF-AVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-AVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 +; IF-AVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; IF-AVL-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; IF-AVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; IF-AVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]] +; IF-AVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; IF-AVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 +; IF-AVL-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; IF-AVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP4]], i32 4, <4 x i1> [[TMP1]], <4 x i32> poison) +; IF-AVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] +; IF-AVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; IF-AVL-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; IF-AVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP7]], i32 4, <4 x i1> [[TMP1]], <4 x i32> poison) +; IF-AVL-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; IF-AVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; IF-AVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 +; IF-AVL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* +; IF-AVL-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], i32 4, <4 x i1> [[TMP1]]) +; IF-AVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; IF-AVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-AVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-AVL: middle.block: +; IF-AVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; IF-AVL: scalar.ph: +; IF-AVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-AVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-AVL: for.cond.cleanup.loopexit: +; IF-AVL-NEXT: br label [[FOR_COND_CLEANUP]] +; IF-AVL: for.cond.cleanup: +; IF-AVL-NEXT: ret void +; IF-AVL: for.body: +; IF-AVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; 
IF-AVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; IF-AVL-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; IF-AVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] +; IF-AVL-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; IF-AVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] +; IF-AVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; IF-AVL-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; IF-AVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; IF-AVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; IF-AVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; +; FORCE-AVL-LABEL: @foo( +; FORCE-AVL-NEXT: entry: +; FORCE-AVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; FORCE-AVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; FORCE-AVL: for.body.preheader: +; FORCE-AVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; FORCE-AVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE-AVL: vector.ph: +; FORCE-AVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 3 +; FORCE-AVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; FORCE-AVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; FORCE-AVL-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE-AVL: vector.body: +; FORCE-AVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-AVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 +; FORCE-AVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FORCE-AVL-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; FORCE-AVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; FORCE-AVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; FORCE-AVL-NEXT: [[TMP2:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[INDEX]] +; FORCE-AVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.umin.i64(i64 4, i64 [[TMP2]]) +; FORCE-AVL-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; FORCE-AVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; FORCE-AVL-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; FORCE-AVL-NEXT: [[VP_OP_LOAD:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* [[TMP6]], i32 4, <4 x i1> , i32 [[TMP4]]) +; FORCE-AVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] +; FORCE-AVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 +; FORCE-AVL-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* +; FORCE-AVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* [[TMP9]], i32 4, <4 x i1> , i32 [[TMP4]]) +; FORCE-AVL-NEXT: [[VP_OP:%.*]] = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> [[VP_OP_LOAD1]], <4 x i32> [[VP_OP_LOAD]], <4 x i1> , i32 [[TMP4]]) +; FORCE-AVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; FORCE-AVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 +; FORCE-AVL-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* +; FORCE-AVL-NEXT: call void @llvm.vp.store.v4i32.p0v4i32(<4 x i32> [[VP_OP]], <4 x i32>* [[TMP12]], i32 4, <4 x i1> , i32 [[TMP4]]) +; FORCE-AVL-NEXT: [[INDEX_NEXT]] = 
add i64 [[INDEX]], 4 +; FORCE-AVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCE-AVL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FORCE-AVL: middle.block: +; FORCE-AVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; FORCE-AVL: scalar.ph: +; FORCE-AVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; FORCE-AVL-NEXT: br label [[FOR_BODY:%.*]] +; FORCE-AVL: for.cond.cleanup.loopexit: +; FORCE-AVL-NEXT: br label [[FOR_COND_CLEANUP]] +; FORCE-AVL: for.cond.cleanup: +; FORCE-AVL-NEXT: ret void +; FORCE-AVL: for.body: +; FORCE-AVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; FORCE-AVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; FORCE-AVL-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; FORCE-AVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] +; FORCE-AVL-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; FORCE-AVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP14]] +; FORCE-AVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; FORCE-AVL-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; FORCE-AVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; FORCE-AVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; FORCE-AVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; +; NO-VP-LABEL: @foo( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; NO-VP-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; NO-VP: for.body.preheader: +; NO-VP-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 3 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0 +; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 +; NO-VP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; NO-VP-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]] +; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 +; NO-VP-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP4]], i32 4, 
<4 x i1> [[TMP1]], <4 x i32> poison) +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; NO-VP-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; NO-VP-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP7]], i32 4, <4 x i1> [[TMP1]], <4 x i32> poison) +; NO-VP-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 +; NO-VP-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* +; NO-VP-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], i32 4, <4 x i1> [[TMP1]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.cond.cleanup.loopexit: +; NO-VP-NEXT: br label [[FOR_COND_CLEANUP]] +; NO-VP: for.cond.cleanup: +; NO-VP-NEXT: ret void +; NO-VP: for.body: +; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] +; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} 
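In the runs above, the FORCE-AVL configuration computes the EVL with the generic umin sequence, while the WITHOUT-AVL configuration falls back to EVL = VF and lets the compare-based mask do the predication. A target whose TTI returns true from useCustomActiveVectorLengthIntrinsic() would instead get a call to the new llvm.experimental.set.vector.length intrinsic from createEVL(); no in-tree target enables that hook in this patch, so the following is only a sketch of the expected shape, assuming a widest loop type of 32 bits:

  %remaining = sub i64 %wide.trip.count, %index
  %evl = call i32 @llvm.experimental.set.vector.length.i64(i64 %remaining, i32 32, i32 1)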
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/vplan-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/vplan-vp-intrinsics.ll @@ -0,0 +1,112 @@ +; REQUIRES: asserts + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=without-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=WITHOUT-AVL %s + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=IF-AVL %s + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=FORCE-AVL %s + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nofree norecurse nounwind uwtable +define dso_local void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) local_unnamed_addr { +; WITHOUT-AVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; WITHOUT-AVL-NEXT: for.body: +; WITHOUT-AVL-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; WITHOUT-AVL-NEXT: EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0> +; WITHOUT-AVL-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; WITHOUT-AVL-NEXT: EMIT vp<%4> = GENERATE-EXPLICIT-VECTOR-LENGTH +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN ir<%0> = load ir<%arrayidx>, vp<%2>, vp<%4> +; WITHOUT-AVL-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN ir<%1> = load ir<%arrayidx2>, vp<%2>, vp<%4> +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN ir<%add> = add ir<%1>, ir<%0>, vp<%2>, vp<%4> +; WITHOUT-AVL-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN store ir<%arrayidx4>, ir<%add>, vp<%2>, vp<%4> +; WITHOUT-AVL-NEXT: No successors +; WITHOUT-AVL-NEXT: } + +; IF-AVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; IF-AVL-NEXT: for.body: +; IF-AVL-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; IF-AVL-NEXT: EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0> +; IF-AVL-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; IF-AVL-NEXT: WIDEN ir<%0> = load ir<%arrayidx>, vp<%2> +; IF-AVL-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; IF-AVL-NEXT: WIDEN ir<%1> = load ir<%arrayidx2>, vp<%2> +; IF-AVL-NEXT: WIDEN ir<%add> = add ir<%1>, ir<%0> +; IF-AVL-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; IF-AVL-NEXT: WIDEN store ir<%arrayidx4>, ir<%add>, vp<%2> +; IF-AVL-NEXT: No successors +; IF-AVL-NEXT: } + +; FORCE-AVL: VPlan 'Initial 
VPlan for VF={4},UF>=1' { +; FORCE-AVL-NEXT: for.body: +; FORCE-AVL-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; FORCE-AVL-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; FORCE-AVL-NEXT: EMIT vp<%2> = all true mask +; FORCE-AVL-NEXT: EMIT vp<%3> = GENERATE-EXPLICIT-VECTOR-LENGTH +; FORCE-AVL-NEXT: PREDICATED-WIDEN ir<%0> = load ir<%arrayidx>, vp<%2>, vp<%3> +; FORCE-AVL-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; FORCE-AVL-NEXT: PREDICATED-WIDEN ir<%1> = load ir<%arrayidx2>, vp<%2>, vp<%3> +; FORCE-AVL-NEXT: PREDICATED-WIDEN ir<%add> = add ir<%1>, ir<%0>, vp<%2>, vp<%3> +; FORCE-AVL-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; FORCE-AVL-NEXT: PREDICATED-WIDEN store ir<%arrayidx4>, ir<%add>, vp<%2>, vp<%3> +; FORCE-AVL-NEXT: No successors +; FORCE-AVL-NEXT: } + +; NO-VP: VPlan 'Initial VPlan for VF={4},UF>=1' { +; NO-VP-NEXT: for.body: +; NO-VP-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; NO-VP-NEXT: EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0> +; NO-VP-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; NO-VP-NEXT: WIDEN ir<%0> = load ir<%arrayidx>, vp<%2> +; NO-VP-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; NO-VP-NEXT: WIDEN ir<%1> = load ir<%arrayidx2>, vp<%2> +; NO-VP-NEXT: WIDEN ir<%add> = add ir<%1>, ir<%0> +; NO-VP-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; NO-VP-NEXT: WIDEN store ir<%arrayidx4>, ir<%add>, vp<%2> +; NO-VP-NEXT: No successors +; NO-VP-NEXT: } + +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +}
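The VPlan dumps above show the masking difference directly: with without-active-vector-length-support the header mask remains the icmp ule of the widened induction against the backedge-taken count, while with force-active-vector-length-support createBlockInMask() replaces it with the new AllTrueMask VPInstruction because the EVL already limits the active lanes. In the generated IR (a sketch for VF=4, names illustrative) this corresponds to:

  ; without-active-vector-length-support: compare-based header mask
  %mask = icmp ule <4 x i64> %induction, %btc.splat
  ; force-active-vector-length-support: constant all-true mask, EVL limits the lanes
  call void @llvm.vp.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %ptr, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %evl)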