diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -162,7 +162,7 @@
   bool skipScalarizationCost() const { return ScalarizationCost.isValid(); }
 };
 
-enum class PredicationStyle { None, Data, DataAndControlFlow };
+enum class PredicationStyle { None, Data, DataAndControlFlow, ImplicitData };
 
 class TargetTransformInfo;
 typedef TargetTransformInfo TTI;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -72,7 +72,7 @@
   bool supportsScalableVectors() const { return ST->hasVInstructions(); }
   bool enableScalableVectorization() const { return ST->hasVInstructions(); }
   PredicationStyle emitGetActiveLaneMask() const {
-    return ST->hasVInstructions() ? PredicationStyle::Data
+    return ST->hasVInstructions() ? PredicationStyle::ImplicitData
                                   : PredicationStyle::None;
   }
   Optional<unsigned> getMaxVScale() const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -112,6 +112,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
@@ -208,30 +209,35 @@
 // and predicate the instructions accordingly. If tail-folding fails, there are
 // different fallback strategies depending on these values:
 namespace PreferPredicateTy {
-  enum Option {
-    ScalarEpilogue = 0,
-    PredicateElseScalarEpilogue,
-    PredicateOrDontVectorize
-  };
+enum Option {
+  ScalarEpilogue = 0,
+  PredicateElseScalarEpilogue,
+  VlElseScalarEpilogue,
+  PredicateOrDontVectorize
+};
 } // namespace PreferPredicateTy
 
 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
     "prefer-predicate-over-epilogue",
-    cl::init(PreferPredicateTy::ScalarEpilogue),
-    cl::Hidden,
+    cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden,
     cl::desc("Tail-folding and predication preferences over creating a scalar "
              "epilogue loop."),
-    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
-                          "scalar-epilogue",
-                          "Don't tail-predicate loops, create scalar epilogue"),
-               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
-                          "predicate-else-scalar-epilogue",
-                          "prefer tail-folding, create scalar epilogue if tail "
-                          "folding fails."),
-               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
-                          "predicate-dont-vectorize",
-                          "prefers tail-folding, don't attempt vectorization if "
-                          "tail-folding fails.")));
+    cl::values(
+        clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue",
+                   "Don't tail-predicate loops, create scalar epilogue"),
+        clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
+                   "predicate-else-scalar-epilogue",
+                   "prefer tail-folding, create scalar epilogue if tail "
+                   "folding fails."),
+        clEnumValN(PreferPredicateTy::VlElseScalarEpilogue,
+                   "vlen-else-scalar-epilogue",
+                   "prefer tail-folding using the dynamic vector length "
" + "Only works for rvv, create scalar epilogue if tail " + "folding fails."), + clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, + "predicate-dont-vectorize", + "prefers tail-folding, don't attempt vectorization if " + "tail-folding fails."))); static cl::opt MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, @@ -1122,6 +1128,8 @@ // Loop hint predicate indicating an epilogue is undesired. CM_ScalarEpilogueNotNeededUsePredicate, + CM_ScalarEpilogueNotNeededUseVl, + // Directive indicating we must either tail fold or not vectorize CM_ScalarEpilogueNotAllowedUsePredicate }; @@ -1524,6 +1532,8 @@ /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } + bool foldTailByImplictMasking() const { return FoldTailByImplictMasking; } + /// Returns true if were tail-folding and want to use the active lane mask /// for vector loop control flow. bool useActiveLaneMaskForControlFlow() const { @@ -1700,6 +1710,10 @@ /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; + /// All blocks of loop are to be masked to fold tail of scalar iterations by + /// vl. + bool FoldTailByImplictMasking = false; + /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated @@ -5061,6 +5075,11 @@ << "LV: Not allowing scalar epilogue, creating predicated " << "vector loop.\n"); break; + case CM_ScalarEpilogueNotNeededUseVl: + LLVM_DEBUG(errs() << "LV: vector Vl hint/switch found.\n" + << "LV: Not allowing scalar epilogue, creating vlenset " + << "vector loop.\n";); + break; case CM_ScalarEpilogueNotAllowedLowTripLoop: // fallthrough as a special case of OptForSize case CM_ScalarEpilogueNotAllowedOptSize: @@ -5086,9 +5105,11 @@ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { // If there was a tail-folding hint/switch, but we can't fold the tail by // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " - "scalar epilogue instead.\n"); + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate || + ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUseVl) { + LLVM_DEBUG( + dbgs() << "LV: Cannot fold tail by masking or vl: vectorize with a " + "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; return computeFeasibleMaxVF(TC, UserVF, false); } @@ -5138,12 +5159,15 @@ // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUseVl) + FoldTailByImplictMasking = true; return MaxFactors; } // If there was a tail-folding hint/switch, but we can't fold the tail by // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate || + ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUseVl) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; @@ -7613,7 +7637,11 @@ // 1. 
   // 1. Set up the skeleton for vectorization, including vector pre-header and
   // middle block. The vector loop is created during VPlan execution.
-  VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
+  unsigned WidestType;
+
+  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
+  VPTransformState State{BestVF, BestUF, LI, DT,
+                         ILV.Builder, &ILV, &BestVPlan, WidestType};
   Value *CanonicalIVStartValue;
   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
       ILV.createVectorizedLoopSkeleton();
@@ -8091,8 +8119,22 @@
   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
   if (EmitGetActiveLaneMask != PredicationStyle::None) {
     VPValue *TC = Plan->getOrCreateTripCount();
-    BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
-                                     nullptr, "active.lane.mask");
+    if (EmitGetActiveLaneMask == PredicationStyle::Data) {
+      assert(!CM.foldTailByImplicitMasking() &&
+             "target does not support a dynamic VL");
+      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                       {IV, TC}, nullptr, "active.lane.mask");
+    }
+    // RISC-V's vsetvli predicates implicitly (PredicationStyle::ImplicitData).
+    else {
+      if (CM.foldTailByImplicitMasking())
+        BlockMask = Builder.createNaryOp(VPInstruction::GetDynamicVl,
+                                         {IV, TC}, nullptr, "get.dynamic.vl");
+      else
+        BlockMask =
+            Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
+                                 nullptr, "active.lane.mask");
+    }
   } else {
     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
@@ -9746,6 +9788,11 @@
       Value *StoredVal = State.get(StoredValue, Part);
       if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+        assert(
+            (MaskPart == nullptr || !MaskPart->getType()->isIntegerTy()) &&
+            "-prefer-predicate-over-epilogue=vlen-else-scalar-epilogue does "
+            "not support gathers/scatters; use "
+            "-prefer-predicate-over-epilogue=predicate-else-scalar-epilogue");
        Value *VectorGep = State.get(getAddr(), Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
@@ -9759,10 +9806,17 @@
       }
       auto *VecPtr =
           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
-      if (isMaskRequired)
-        NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
-                                          BlockInMaskParts[Part]);
-      else
+      if (isMaskRequired) {
+        Value *Mask = BlockInMaskParts[Part];
+        Type *MaskTy = Mask->getType();
+        if (MaskTy->isIntegerTy())
+          NewSI = Builder.CreateIntrinsic(Intrinsic::riscv_vse,
+                                          {StoredVal->getType(), MaskTy},
+                                          {StoredVal, VecPtr, Mask}, nullptr);
+        else
+          NewSI =
+              Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, Mask);
+      } else
         NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
     }
     State.addMetadata(NewSI, SI);
@@ -9777,6 +9831,10 @@
       Value *NewLI;
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+        assert((MaskPart == nullptr || !MaskPart->getType()->isIntegerTy()) &&
+               "-prefer-predicate-over-epilogue=vlen-else-scalar-epilogue "
+               "does not support gathers/scatters; use "
+               "-prefer-predicate-over-epilogue=predicate-else-scalar-epilogue");
        Value *VectorGep = State.get(getAddr(), Part);
        NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
                                           nullptr, "wide.masked.gather");
@@ -9784,11 +9842,18 @@
     } else {
       auto *VecPtr =
           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
-      if (isMaskRequired)
-        NewLI = Builder.CreateMaskedLoad(
-            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
-            PoisonValue::get(DataTy), "wide.masked.load");
-      else
+      if (isMaskRequired) {
+        Value *Mask = BlockInMaskParts[Part];
+        Type *MaskTy = Mask->getType();
+        if (MaskTy->isIntegerTy())
+          NewLI = Builder.CreateIntrinsic(
+              Intrinsic::riscv_vle, {DataTy, MaskTy},
+              {UndefValue::get(DataTy), VecPtr, Mask}, nullptr);
+        else
+          NewLI = Builder.CreateMaskedLoad(
+              DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
+              PoisonValue::get(DataTy), "wide.masked.load");
+      } else
        NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment,
                                          "wide.load");
 
@@ -9829,6 +9894,8 @@
   switch (PreferPredicateOverEpilogue) {
   case PreferPredicateTy::ScalarEpilogue:
     return CM_ScalarEpilogueAllowed;
+  case PreferPredicateTy::VlElseScalarEpilogue:
+    return CM_ScalarEpilogueNotNeededUseVl;
   case PreferPredicateTy::PredicateElseScalarEpilogue:
     return CM_ScalarEpilogueNotNeededUsePredicate;
   case PreferPredicateTy::PredicateOrDontVectorize:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -205,13 +205,14 @@
 struct VPTransformState {
   VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
                    DominatorTree *DT, IRBuilderBase &Builder,
-                   InnerLoopVectorizer *ILV, VPlan *Plan)
-      : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
-        LVer(nullptr) {}
+                   InnerLoopVectorizer *ILV, VPlan *Plan, unsigned WidestTy)
+      : VF(VF), UF(UF), WidestTy(WidestTy), LI(LI), DT(DT), Builder(Builder),
+        ILV(ILV), Plan(Plan), LVer(nullptr) {}
 
   /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
   ElementCount VF;
   unsigned UF;
+  unsigned WidestTy; // Width in bits of the widest loop type (for SEW/LMUL).
 
   /// Hold the indices to generate specific scalar instructions. Null indicates
   /// that all instances are to be generated, using either scalar or vector
@@ -786,6 +787,7 @@
     SLPLoad,
     SLPStore,
     ActiveLaneMask,
+    GetDynamicVl,
     CanonicalIVIncrement,
     CanonicalIVIncrementNUW,
     // The next two are similar to the above, but instead increment the
@@ -907,6 +909,7 @@
     default:
       return false;
     case VPInstruction::ActiveLaneMask:
+    case VPInstruction::GetDynamicVl:
     case VPInstruction::CanonicalIVIncrement:
     case VPInstruction::CanonicalIVIncrementNUW:
    case VPInstruction::CanonicalIVIncrementForPart:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -17,14 +17,17 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
@@ -222,6 +225,24 @@
     State.set(this, V, Part);
     break;
   }
+  case VPInstruction::GetDynamicVl: {
+    // Get the first lane of the vector induction variable.
+    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
+    // Get the original loop trip count.
+    Value *ScalarTC = State.get(getOperand(1), Part);
+    ScalarTC = Builder.CreateIntCast(ScalarTC, VIVElem0->getType(), true);
+    Value *AvLen = Builder.CreateSub(ScalarTC, VIVElem0, "avl_length");
+    auto *IntTy = VIVElem0->getType();
+    // vsetvli operands: AVL, SEW encoded as log2(bytes), LMUL encoded as log2.
+    Instruction *Call = Builder.CreateIntrinsic(
+        Intrinsic::riscv_vsetvli, IntTy,
+        {AvLen, ConstantInt::get(IntTy, Log2_32(State.WidestTy / 8)),
+         ConstantInt::get(IntTy, Log2_32(State.VF.getKnownMinValue() *
+                                         State.WidestTy / 64))},
+        nullptr, Name);
+    State.set(this, Call, Part);
+    break;
+  }
   case VPInstruction::ActiveLaneMask: {
     // Get first lane of vector induction variable.
     Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
@@ -385,6 +406,9 @@
   case VPInstruction::ActiveLaneMask:
     O << "active lane mask";
     break;
+  case VPInstruction::GetDynamicVl:
+    O << "get dynamic vl";
+    break;
   case VPInstruction::FirstOrderRecurrenceSplice:
     O << "first-order splice";
     break;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/prefer_tail_folding_with_vsetvl.ll b/llvm/test/Transforms/LoopVectorize/RISCV/prefer_tail_folding_with_vsetvl.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/prefer_tail_folding_with_vsetvl.ll
@@ -0,0 +1,310 @@
+; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+f \
+; RUN:   -loop-vectorize -scalable-vectorization=on \
+; RUN:   -prefer-predicate-over-epilogue=vlen-else-scalar-epilogue -S < %s | \
+; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: prefer_folding(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 2, i32 0)
+; PREFER-FOLDING: call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.i32({{.*}}, {{.*}}, i32 %get.dynamic.vl)
+; PREFER-FOLDING: call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.i32({{.*}}, {{.*}}, i32 %get.dynamic.vl)
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2i32.i32({{.*}}, {{.*}}, i32 %get.dynamic.vl)
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+;
+; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
+; NO-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %for.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
+  store i32 %add, i32* %arrayidx2, align 4
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+
+define void @mixed_types(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32* noalias nocapture %D, i32* noalias nocapture readonly %E, i32* noalias nocapture readonly %F) #0 {
+; CHECK-LABEL: mixed_types(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 2, i32 0)
+; PREFER-FOLDING: call <vscale x 2 x i16> @llvm.riscv.vle.nxv2i16.i32
+; PREFER-FOLDING: call <vscale x 2 x i16> @llvm.riscv.vle.nxv2i16.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2i16.i32
+; PREFER-FOLDING: call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.i32
+; PREFER-FOLDING: call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2i32.i32
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.018 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
+  %arrayidx = getelementptr inbounds i16, i16* %B, i32 %i.018
+  %0 = load i16, i16* %arrayidx, align 2
+  %arrayidx1 = getelementptr inbounds i16, i16* %C, i32 %i.018
+  %1 = load i16, i16* %arrayidx1, align 2
+  %add = add i16 %1, %0
+  %arrayidx4 = getelementptr inbounds i16, i16* %A, i32 %i.018
+  store i16 %add, i16* %arrayidx4, align 2
+  %arrayidx5 = getelementptr inbounds i32, i32* %E, i32 %i.018
+  %2 = load i32, i32* %arrayidx5, align 4
+  %arrayidx6 = getelementptr inbounds i32, i32* %F, i32 %i.018
+  %3 = load i32, i32* %arrayidx6, align 4
+  %add7 = add nsw i32 %3, %2
+  %arrayidx8 = getelementptr inbounds i32, i32* %D, i32 %i.018
+  store i32 %add7, i32* %arrayidx8, align 4
+  %add9 = add nuw nsw i32 %i.018, 1
+  %exitcond = icmp eq i32 %add9, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define void @zero_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: zero_extending_load_allowed(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 2, i32 0)
+; PREFER-FOLDING: call <vscale x 2 x i8> @llvm.riscv.vle.nxv2i8.i32
+; PREFER-FOLDING: call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2i32.i32
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %1, %conv
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
+  store i32 %add, i32* %arrayidx2, align 4
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define void @sign_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: sign_extending_load_allowed(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 2, i32 0)
+; PREFER-FOLDING: call <vscale x 2 x i8> @llvm.riscv.vle.nxv2i8.i32
+; PREFER-FOLDING: call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2i32.i32
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = sext i8 %0 to i32
+  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %1, %conv
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
+  store i32 %add, i32* %arrayidx2, align 4
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define void @narrowing_store_allowed(i8* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: narrowing_store_allowed(
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2i8.i32
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %1, %0
+  %conv = trunc i32 %add to i8
+  %arrayidx2 = getelementptr inbounds i8, i8* %A, i32 %i.09
+  store i8 %conv, i8* %arrayidx2, align 1
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define dso_local void @half(half* noalias nocapture %A, half* noalias nocapture readonly %B, half* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: half(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 1, i32 0)
+; PREFER-FOLDING: call <vscale x 4 x half> @llvm.riscv.vle.nxv4f16.i32
+; PREFER-FOLDING: call <vscale x 4 x half> @llvm.riscv.vle.nxv4f16.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv4f16.i32
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
+  %0 = load half, half* %arrayidx, align 2
+  %arrayidx1 = getelementptr inbounds half, half* %C, i32 %i.09
+  %1 = load half, half* %arrayidx1, align 2
+  %add = fadd fast half %1, %0
+  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
+  store half %add, half* %arrayidx2, align 2
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: float(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 2, i32 1)
+; PREFER-FOLDING: call <vscale x 4 x float> @llvm.riscv.vle.nxv4f32.i32
+; PREFER-FOLDING: call <vscale x 4 x float> @llvm.riscv.vle.nxv4f32.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv4f32.i32
+; PREFER-FOLDING: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; PREFER-FOLDING: %[[STEP:.*]] = mul i32 %[[VSCALE]], 4
+; PREFER-FOLDING: %index.next = add i32 %index, %[[STEP]]
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
+  %1 = load float, float* %arrayidx1, align 4
+  %add = fadd fast float %1, %0
+  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
+  store float %add, float* %arrayidx2, align 4
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
+}
+
+define void @fpext_allowed(float* noalias nocapture %A, half* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: fpext_allowed(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 2, i32 0)
+; PREFER-FOLDING: call <vscale x 2 x half> @llvm.riscv.vle.nxv2f16.i32
+; PREFER-FOLDING: call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2f32.i32
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
+  %0 = load half, half* %arrayidx, align 2
+  %conv = fpext half %0 to float
+  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
+  %1 = load float, float* %arrayidx1, align 4
+  %add = fadd fast float %1, %conv
+  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
+  store float %add, float* %arrayidx2, align 4
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define void @fptrunc_allowed(half* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: fptrunc_allowed(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 2, i32 0)
+; PREFER-FOLDING: call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.i32
+; PREFER-FOLDING: call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2f16.i32
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
+  %1 = load float, float* %arrayidx1, align 4
+  %add = fadd fast float %1, %0
+  %conv = fptrunc float %add to half
+  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
+  store half %conv, half* %arrayidx2, align 2
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { "target-features"="+v,+f,+experimental-zvfh,+zfh" }
+!5 = distinct !{!5, !6}
!{!"llvm.loop.vectorize.enable", i1 true} + +!7 = distinct !{!7, !8} +!8 = !{!"llvm.loop.vectorize.predicate.enable", i1 false} + +!10 = distinct !{!10, !11} +!11 = !{!"llvm.loop.vectorize.width", i32 4}