Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -928,6 +928,10 @@
   /// \return The width of the smallest vector register type.
   unsigned getMinVectorRegisterBitWidth() const;
 
+  /// \return The maximum value for vscale in scalable vectors such as
+  /// <vscale x 4 x i32>. Default is None.
+  Optional<unsigned> getMaxVScale() const;
+
   /// \return True if the vectorization factor should be chosen to
   /// make the vector of the smallest element type match the size of a
   /// vector register. For wider element types, this could result in
@@ -1504,6 +1508,7 @@
   virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
+  virtual Optional<unsigned> getMaxVScale() const = 0;
   virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
   virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
   virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0;
@@ -1921,6 +1926,9 @@
   unsigned getMinVectorRegisterBitWidth() override {
     return Impl.getMinVectorRegisterBitWidth();
   }
+  Optional<unsigned> getMaxVScale() const override {
+    return Impl.getMaxVScale();
+  }
   bool shouldMaximizeVectorBandwidth(bool OptSize) const override {
     return Impl.shouldMaximizeVectorBandwidth(OptSize);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -352,6 +352,8 @@
 
   unsigned getMinVectorRegisterBitWidth() { return 128; }
 
+  llvm::Optional<unsigned> getMaxVScale() const { return None; }
+
  bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; }
 
   unsigned getMinimumVF(unsigned ElemWidth) const { return 0; }
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -571,6 +571,8 @@
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
 
+  Optional<unsigned> getMaxVScale() const { return None; }
+
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the demanded result elements need to be inserted and/or
   /// extracted from vectors.
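Usage note (not part of the patch): a target that knows an architectural upper bound on its vector length can override the new hook in its TTI implementation, following the same shape as the AArch64 change below. A minimal sketch; the names MyTargetTTIImpl and ST->hasScalableVectors() and the 2048/128-bit figures are illustrative assumptions, not real APIs:

// Sketch only: hypothetical target override of the new TTI hook.
Optional<unsigned> MyTargetTTIImpl::getMaxVScale() const {
  // vscale is bounded by (maximum architectural vector length in bits) /
  // (bits covered by one vscale unit), e.g. 2048 / 128 = 16 for an
  // SVE-like target.
  if (ST->hasScalableVectors())
    return 2048 / 128;
  return BaseT::getMaxVScale(); // Default: None.
}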
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -627,6 +627,10 @@
   return TTIImpl->getMinVectorRegisterBitWidth();
 }
 
+llvm::Optional<unsigned> TargetTransformInfo::getMaxVScale() const {
+  return TTIImpl->getMaxVScale();
+}
+
 bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const {
   return TTIImpl->shouldMaximizeVectorBandwidth(OptSize);
 }
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -115,6 +115,12 @@
     return ST->getMinVectorRegisterBitWidth();
   }
 
+  Optional<unsigned> getMaxVScale() const {
+    if (ST->hasSVE())
+      return AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
+    return BaseT::getMaxVScale();
+  }
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -272,6 +272,12 @@
         "an instruction to a single constant value. Mostly "
         "useful for getting consistent testing."));
 
+static cl::opt<bool> ForceTargetSupportsScalableVectors(
+    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
+    cl::desc(
+        "Allow scalable vectorization for targets that don't support scalable "
+        "vectors."));
+
 static cl::opt<unsigned> SmallLoopCost(
     "small-loop-cost", cl::init(20), cl::Hidden,
     cl::desc(
@@ -5562,15 +5568,63 @@
   // dependence distance).
   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
 
-  if (UserVF.isNonZero()) {
-    // For now, don't verify legality of scalable vectors.
-    // This will be addressed properly in https://reviews.llvm.org/D91718.
-    if (UserVF.isScalable())
+  bool IgnoreUserVF = UserVF.isScalable() && !TTI.supportsScalableVectors() &&
+                      !ForceTargetSupportsScalableVectors;
+  if (IgnoreUserVF) {
+    LLVM_DEBUG(
+        dbgs() << "LV: Ignoring VF=" << UserVF
+               << " because target does not support scalable vectors.\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreUserVF",
+                                        TheLoop->getStartLoc(),
+                                        TheLoop->getHeader())
+             << "Ignoring VF=" << ore::NV("UserVF", UserVF)
+             << " because target does not support scalable vectors.";
+    });
+  }
+
+  // If the user vectorization factor is legally unsafe, clamp it to a safe
+  // value. Otherwise, return as is.
+  if (UserVF.isNonZero() && !IgnoreUserVF) {
+    // Nothing to do if there are no dependencies.
+    if (MaxSafeVectorWidthInBits == UINT_MAX)
       return UserVF;
 
-    // If legally unsafe, clamp the user vectorization factor to a safe value.
-    unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
-    if (UserVF.getFixedValue() <= MaxSafeVF)
+    unsigned MaxSafeElements =
+        PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
+    ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
+
+    if (UserVF.isScalable()) {
+      Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+      assert(MaxVScale &&
+             "max vscale undefined for target that supports scalable vectors!");
+
+      // Scale VF by vscale before checking if it's safe.
+      MaxSafeVF =
+          ElementCount::getScalable(MaxSafeElements / MaxVScale.getValue());
+
+      if (MaxSafeVF.isZero()) {
+        // The dependence distance is too small to use scalable vectors,
+        // fall back on fixed.
+        LLVM_DEBUG(
+            dbgs()
+            << "LV: Max legal vector width too small, scalable vectorization "
+               "unfeasible. Using fixed-width vectorization instead.\n");
+        ORE->emit([&]() {
+          return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
+                                            TheLoop->getStartLoc(),
+                                            TheLoop->getHeader())
+                 << "Max legal vector width too small, scalable vectorization "
+                 << "unfeasible. Using fixed-width vectorization instead.";
+        });
+        return computeFeasibleMaxVF(
+            ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
+
+    if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
       return UserVF;
 
     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
@@ -5585,7 +5639,7 @@
              << " is unsafe, clamping to maximum safe vectorization factor "
              << ore::NV("VectorizationFactor", MaxSafeVF);
     });
-    return ElementCount::getFixed(MaxSafeVF);
+    return MaxSafeVF;
   }
 
   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
@@ -7385,17 +7439,24 @@
   ElementCount MaxVF = MaybeMaxVF.getValue();
   assert(MaxVF.isNonZero() && "MaxVF is zero.");
 
-  if (!UserVF.isZero() && ElementCount::isKnownLE(UserVF, MaxVF)) {
-    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
+  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
+  if (!UserVF.isZero() &&
+      (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
+    // FIXME: MaxVF is temporarily used in place of UserVF for illegal scalable
+    // VFs here, this should be reverted to only use legal UserVFs once the
+    // loop below supports scalable VFs.
+    ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
+    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
+                      << " VF " << VF << ".\n");
+    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
            "VF needs to be a power of two");
     // Collect the instructions (and their associated costs) that will be more
     // profitable to scalarize.
-    CM.selectUserVectorizationFactor(UserVF);
+    CM.selectUserVectorizationFactor(VF);
     CM.collectInLoopReductions();
-    buildVPlansWithVPRecipes(UserVF, UserVF);
+    buildVPlansWithVPRecipes(VF, VF);
     LLVM_DEBUG(printPlans(dbgs()));
-    return {{UserVF, 0}};
+    return {{VF, 0}};
   }
 
   assert(!MaxVF.isScalable() &&
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-loop-unpredicated-body-scalar-tail.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-loop-unpredicated-body-scalar-tail.ll
@@ -0,0 +1,101 @@
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -S -loop-vectorize -instcombine -force-vector-interleave=1 < %s | FileCheck %s --check-prefix=CHECKUF1
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -S -loop-vectorize -instcombine -force-vector-interleave=2 < %s | FileCheck %s --check-prefix=CHECKUF2
+
+; CHECKUF1: for.body.preheader:
+; CHECKUF1-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
+
+; CHECKUF1: vector.ph:
+; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
+; CHECKUF1: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECKUF1: vector.body:
+; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECKUF1: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECKUF1: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF1: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
+; CHECKUF1: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
+; CHECKUF1: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1: %index.next = add i64 %index, %[[VSCALEX4]]
+; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+
+; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2).
+; There is also the increment for the next iteration, e.g. instead of indexing IDXB, it indexes at IDXB + vscale * 4.
+
+; CHECKUF2: for.body.preheader:
+; CHECKUF2-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count
+
+; CHECKUF2: vector.ph:
+; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]
+; CHECKUF2: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECKUF2: vector.body:
+; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECKUF2: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
+; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXB]], i64 %[[VSCALE2_EXT]]
+; CHECKUF2: %[[IDXB_NEXT_CAST:.*]] = bitcast double* %[[IDXB_NEXT]] to <vscale x 4 x double>*
+; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
+; CHECKUF2: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
+; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
+; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXA]], i64 %[[VSCALE2_EXT]]
+; CHECKUF2: %[[IDXA_NEXT_CAST:.*]] = bitcast double* %[[IDXA_NEXT]] to <vscale x 4 x double>*
+; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2: %index.next = add i64 %index, %[[VSCALEX8]]
+; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -0,0 +1,333 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -S < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck --check-prefix=CHECK-DBG %s
+; RUN: opt -mtriple=aarch64-none-linux-gnu -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck --check-prefix=CHECK-NO-SVE %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; These tests validate the behaviour of scalable vectorization factor hints,
+; where the following applies:
+;
+; * If the backend does not support scalable vectors, ignore the hint and let
+;   the vectorizer pick a VF.
+; * If there are no dependencies and the VF is a power of 2, the VF should be
+;   accepted. This applies to both fixed and scalable VFs.
+; * If the dependency is too small to use scalable vectors, change the VF to
+;   fixed, where existing behavior applies (clamping).
+; * If scalable vectorization is feasible given the dependency and the VF is
+;   valid, accept it. Otherwise, clamp to the max scalable VF.
+
+; test1
+;
+; Scalable vectorization unfeasible, clamp VF from (4, scalable) -> (4, fixed).
+;
+; The pragma applied to this loop implies a scalable vector <vscale x 4 x i32>
+; be used for vectorization. For fixed vectors the MaxVF=8, otherwise there
+; would be a dependence between vector lanes for vectors greater than 256 bits.
+;
+; void test1(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; i++) {
+;     a[i + 8] = a[i] + b[i];
+;   }
+; }
+;
+; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: LV: The max safe VF is: 8.
+; CHECK-DBG: LV: Selecting VF: 4.
+; CHECK-LABEL: @test1
+; CHECK: <4 x i32>
+define void @test1(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test2
+;
+; Scalable vectorization unfeasible, clamp VF from (8, scalable) -> (4, fixed).
+;
+; void test2(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(8, scalable)
+;   for (int i=0; i<N; i++) {
+;     a[i + 4] = a[i] + b[i];
+;   }
+; }
+;
+; CHECK-LABEL: @test2
+; CHECK: <4 x i32>
+define void @test2(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !3
+
+exit:
+  ret void
+}
+
+!3 = !{!3, !4, !5}
+!4 = !{!"llvm.loop.vectorize.width", i32 8}
+!5 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test3
+;
+; Scalable vectorization feasible and the VF is valid.
+;
+; Specifies a vector of <vscale x 2 x i32>, i.e. maximum of 32 x i32 with 2
+; words per 128-bits (unpacked).
+;
+; void test3(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(2, scalable)
+;   for (int i=0; i<N; i++) {
+;     a[i + 32] = a[i] + b[i];
+;   }
+; }
+;
+; CHECK-LABEL: @test3
+; CHECK: <vscale x 2 x i32>
+define void @test3(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 32
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !6
+
+exit:
+  ret void
+}
+
+!6 = !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 2}
+!8 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test4
+;
+; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
+;
+; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
+; words per 128-bits (packed).
+;
+; void test4(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; i++) {
+;     a[i + 32] = a[i] + b[i];
+;   }
+; }
+;
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2
+; CHECK-DBG: LV: Using max VF vscale x 2
+; CHECK-LABEL: @test4
+; CHECK: <vscale x 2 x i32>
+define void @test4(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 32
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !9
+
+exit:
+  ret void
+}
+
+!9 = !{!9, !10, !11}
+!10 = !{!"llvm.loop.vectorize.width", i32 4}
+!11 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test5
+;
+; Scalable vectorization feasible and the VF is valid.
+;
+; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
+; words per 128-bits (packed).
+;
+; void test5(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; i++) {
+;     a[i + 128] = a[i] + b[i];
+;   }
+; }
+;
+; CHECK-LABEL: @test5
+; CHECK: <vscale x 4 x i32>
+define void @test5(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 128
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !12
+
+exit:
+  ret void
+}
+
+!12 = !{!12, !13, !14}
+!13 = !{!"llvm.loop.vectorize.width", i32 4}
+!14 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; test6
+;
+; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
+;
+; Specifies a vector of <vscale x 16 x i32>, i.e. maximum of 256 x i32.
+;
+; void test6(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(16, scalable)
+;   for (int i=0; i<N; i++) {
+;     a[i + 128] = a[i] + b[i];
+;   }
+; }
+;
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8
+; CHECK-DBG: LV: Using max VF vscale x 8
+; CHECK-LABEL: @test6
+; CHECK: <vscale x 8 x i32>
+define void @test6(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 128
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !15
+
+exit:
+  ret void
+}
+
+!15 = !{!15, !16, !17}
+!16 = !{!"llvm.loop.vectorize.width", i32 16}
+!17 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; CHECK-NO-SVE: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK-NO-SVE: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK-NO-SVE: LV: Selecting VF: 4.
+; CHECK-NO-SVE: <4 x i32>
+define void @test_no_sve(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !18
+
+exit:
+  ret void
+}
+
+!18 = !{!18, !19, !20}
+!19 = !{!"llvm.loop.vectorize.width", i32 4}
+!20 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/metadata-width.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/metadata-width.ll
+++ llvm/test/Transforms/LoopVectorize/metadata-width.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-target-supports-scalable-vectors=true -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
Index: llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
+++ llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-limitations.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -passes='loop-vectorize' -force-vector-width=2 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='loop-vectorize' -force-vector-width=2 -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512"
 
Index: llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
+++ /dev/null
@@ -1,101 +0,0 @@
-; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 < %s | FileCheck %s --check-prefix=CHECKUF1
-; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 < %s | FileCheck %s --check-prefix=CHECKUF2
-
-; CHECKUF1: for.body.preheader:
-; CHECKUF1-DAG: %wide.trip.count = zext i32 %N to i64
-; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
-; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
-
-; CHECKUF1: vector.ph:
-; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
-; CHECKUF1-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
-; CHECKUF1: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
-
-; CHECKUF1: vector.body:
-; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
-; CHECKUF1: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
-; CHECKUF1: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
-; CHECKUF1: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
-; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
-; CHECKUF1: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
-; CHECKUF1: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
-; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
-; CHECKUF1: %index.next = add i64 %index, %[[VSCALEX4]]
-; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
-; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
-
-
-; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2).
-; There is also the increment for the next iteration, e.g. instead of indexing IDXB, it indexes at IDXB + vscale * 4.
-
-; CHECKUF2: for.body.preheader:
-; CHECKUF2-DAG: %wide.trip.count = zext i32 %N to i64
-; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
-; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count
-
-; CHECKUF2: vector.ph:
-; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
-; CHECKUF2-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]
-; CHECKUF2: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
-
-; CHECKUF2: vector.body:
-; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
-; CHECKUF2: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
-; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
-; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
-; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
-; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
-; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXB]], i64 %[[VSCALE2_EXT]]
-; CHECKUF2: %[[IDXB_NEXT_CAST:.*]] = bitcast double* %[[IDXB_NEXT]] to <vscale x 4 x double>*
-; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8, !alias.scope !0
-; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
-; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
-; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
-; CHECKUF2: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
-; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
-; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
-; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
-; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
-; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXA]], i64 %[[VSCALE2_EXT]]
-; CHECKUF2: %[[IDXA_NEXT_CAST:.*]] = bitcast double* %[[IDXA_NEXT]] to <vscale x 4 x double>*
-; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8, !alias.scope !3, !noalias !0
-; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
-; CHECKUF2: %index.next = add i64 %index, %[[VSCALEX8]]
-; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
-; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
-
-define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
-entry:
-  %cmp7 = icmp sgt i32 %N, 0
-  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  %wide.trip.count = zext i32 %N to i64
-  br label %for.body
-
-for.cond.cleanup:                                 ; preds = %for.body, %entry
-  ret void
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
-  %0 = load double, double* %arrayidx, align 8
-  %add = fadd double %0, 1.000000e+00
-  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
-  store double %add, double* %arrayidx2, align 8
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
-}
-
-!1 = distinct !{!1, !2, !3}
-!2 = !{!"llvm.loop.vectorize.width", i32 4}
-!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
Index: llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK: LV: The Widest register safe to use is: 32 bits.
+define void @test1(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
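A worked sketch (not part of the patch) of how the new clamping in computeFeasibleMaxVF plays out for the AArch64 hint tests above, assuming SVE's getMaxVScale() of 2048 / 128 = 16 and i32 elements; powerOf2Floor below is a local stand-in for llvm::PowerOf2Floor so the snippet is self-contained:

#include <cstdint>
#include <cstdio>

// Largest power of two <= X (local stand-in for llvm::PowerOf2Floor).
static uint64_t powerOf2Floor(uint64_t X) {
  while (X & (X - 1))
    X &= X - 1; // clear the lowest set bit until one bit remains
  return X;
}

int main() {
  const uint64_t MaxVScale = 2048 / 128; // SVE: vscale <= 16 (assumption)
  const uint64_t WidestType = 32;        // i32 elements

  // Dependence distances (in elements) matching test1/test2 (8),
  // test3/test4 (32) and test5/test6 (128) above.
  const uint64_t Distances[] = {8, 32, 128};
  for (uint64_t Dist : Distances) {
    uint64_t MaxSafeVectorWidthInBits = Dist * WidestType;
    uint64_t MaxSafeElements =
        powerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
    uint64_t ScalableMaxSafeVF = MaxSafeElements / MaxVScale;
    if (ScalableMaxSafeVF == 0)
      printf("distance %llu: scalable unfeasible, fixed MaxSafeVF = %llu\n",
             (unsigned long long)Dist, (unsigned long long)MaxSafeElements);
    else
      printf("distance %llu: MaxSafeVF = vscale x %llu\n",
             (unsigned long long)Dist, (unsigned long long)ScalableMaxSafeVF);
  }
  return 0;
}

Under these assumptions the output matches the CHECK-DBG lines: a fixed-width fallback with MaxSafeVF = 8 for the 8-element distance (test1/test2), vscale x 2 for the 32-element distance (test3/test4), and vscale x 8 for the 128-element distance (test5/test6).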