Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -927,6 +927,10 @@
   /// \return The width of the smallest vector register type.
   unsigned getMinVectorRegisterBitWidth() const;
 
+  /// \return The maximum value that vscale can take on this target for
+  /// scalable vectors such as <vscale x 4 x i32>.
+  unsigned getMaxVScale() const;
+
   /// \return True if the vectorization factor should be chosen to
   /// make the vector of the smallest element type match the size of a
   /// vector register. For wider element types, this could result in
@@ -1495,6 +1499,7 @@
   virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
+  virtual unsigned getMaxVScale() const = 0;
   virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
   virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0;
   virtual bool shouldConsiderAddressTypePromotion(
@@ -1910,6 +1915,7 @@
   unsigned getMinVectorRegisterBitWidth() override {
     return Impl.getMinVectorRegisterBitWidth();
   }
+  unsigned getMaxVScale() const override { return Impl.getMaxVScale(); }
   bool shouldMaximizeVectorBandwidth(bool OptSize) const override {
     return Impl.shouldMaximizeVectorBandwidth(OptSize);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -352,6 +352,8 @@
 
   unsigned getMinVectorRegisterBitWidth() { return 128; }
 
+  unsigned getMaxVScale() const { return 1; }
+
   bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; }
 
   unsigned getMinimumVF(unsigned ElemWidth) const { return 0; }
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -567,6 +567,8 @@
 
   unsigned getRegisterBitWidth(bool Vector) const { return 32; }
 
+  unsigned getMaxVScale() const { return 1; }
+
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the demanded result elements need to be inserted and/or
   /// extracted from vectors.
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -626,6 +626,10 @@
   return TTIImpl->getMinVectorRegisterBitWidth();
 }
 
+unsigned TargetTransformInfo::getMaxVScale() const {
+  return TTIImpl->getMaxVScale();
+}
+
 bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const {
   return TTIImpl->shouldMaximizeVectorBandwidth(OptSize);
 }
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -115,6 +115,12 @@
     return ST->getMinVectorRegisterBitWidth();
   }
 
+  unsigned getMaxVScale() const {
+    if (!ST->hasSVE()) // No SVE: defer to the base implementation.
+      return BaseT::getMaxVScale();
+    return AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
+  }
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5394,10 +5394,34 @@
   // dependence distance).
   unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
 
+  // If the user vectorization factor is legally unsafe, clamp it to a safe
+  // value. Otherwise, return as is.
   if (UserVF.isNonZero()) {
-    // If legally unsafe, clamp the user vectorization factor to a safe value.
-    unsigned MaxSafeVF = PowerOf2Floor(MaxSafeRegisterWidth / WidestType);
-    if (UserVF.getKnownMinValue() <= MaxSafeVF)
+    // Nothing to do if there are no dependencies.
+    if (MaxSafeVectorWidthInBits >= UINT_MAX)
+      return UserVF;
+
+    unsigned MaxSafeElements =
+        PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
+    // If scalable, divide by the max vscale so the VF is safe for any vscale.
+    ElementCount MaxSafeVF =
+        UserVF.isScalable()
+            ? ElementCount::getScalable(MaxSafeElements / TTI.getMaxVScale())
+            : ElementCount::getFixed(MaxSafeElements);
+
+    if (UserVF.isScalable() && MaxSafeVF.isZero()) {
+      // Dependence distance too small to use scalable vectors. Clamp to max
+      // fixed VF.
+      LLVM_DEBUG(
+          dbgs()
+          << "LV: Max legal vector width too small, scalable vectorization "
+             "unfeasible. Using fixed-width vectorization instead.\n");
+      MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
+    }
+
+    LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
+
+    if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
       return UserVF;
 
     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
@@ -5412,7 +5436,7 @@
              << " is unsafe, clamping to maximum safe vectorization factor "
              << ore::NV("VectorizationFactor", MaxSafeVF);
     });
-    return ElementCount::getFixed(MaxSafeVF);
+    return MaxSafeVF;
   }
 
   WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
@@ -7089,18 +7113,20 @@
   ElementCount MaxVF = MaybeMaxVF.getValue();
   assert(MaxVF.isNonZero() && "MaxVF is zero.");
 
-  if (!UserVF.isZero() &&
-      UserVF.getKnownMinValue() <= MaxVF.getKnownMinValue()) {
-    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
+  if (!UserVF.isZero()) {
+    bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
+    ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
+    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
+                      << " VF " << VF << ".\n");
+    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
            "VF needs to be a power of two");
     // Collect the instructions (and their associated costs) that will be more
     // profitable to scalarize.
-    CM.selectUserVectorizationFactor(UserVF);
+    CM.selectUserVectorizationFactor(VF);
     CM.collectInLoopReductions();
-    buildVPlansWithVPRecipes(UserVF, UserVF);
+    buildVPlansWithVPRecipes(VF, VF);
     LLVM_DEBUG(printPlans(dbgs()));
-    return {{UserVF, 0}};
+    return {{VF, 0}};
   }
 
   assert(!MaxVF.isScalable() &&
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -0,0 +1,241 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -S < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck --check-prefix=CHECK-DBG %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; test1
+;
+; The pragma applied to this loop requests that a scalable vector
+; <vscale x 4 x i32> be used for vectorization. For fixed vectors MaxVF=4, as
+; vectors wider than 128 bits would create a dependence between vector lanes.
+;
+; void test1(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 4] = a[i] + b[i];
+;   }
+; }
+;
+; For scalable vectorization 'vscale' has to be considered: in this example it
+; is unsafe to vectorize unless max(vscale)=1. For SVE max(vscale)=16, so check
+; that fixed-width vectorization is used instead.
+
+; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: LV: The max safe VF is: 4.
+; CHECK-DBG: LV: Using max VF 4.
+; CHECK-LABEL: @test1
+; CHECK: <4 x i32>
+define void @test1(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
+!2 = !{!"llvm.loop.vectorize.width", !3}
+!3 = !{i32 4, i1 true}
+
+; test2
+;
+; Specifies a vector of <vscale x 2 x i32>, i.e. maximum of 32 x i32 with 2
+; words per 128-bits (unpacked).
+;
+; void test2(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(2, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 32] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=32, Max scalable VF=2, safe to vectorize.
+
+; CHECK-DBG-LABEL: LV: Checking a loop in "test2"
+; CHECK-DBG: LV: The max safe VF is: vscale x 2.
+; CHECK-DBG: LV: Using user VF vscale x 2.
+; CHECK-LABEL: @test2
+; CHECK: <vscale x 2 x i32>
+define void @test2(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 32
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !4
+
+exit:
+  ret void
+}
+
+!4 = !{!4, !5, !6}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = !{!"llvm.loop.vectorize.width", !7}
+!7 = !{i32 2, i1 true}
+
+; test3
+;
+; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
+; words per 128-bits (packed).
+;
+; void test3(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 32] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=32, Max scalable VF=2, user VF=4 is unsafe. Should clamp to 2.
+
+; CHECK-DBG-LABEL: LV: Checking a loop in "test3"
+; CHECK-DBG: LV: The max safe VF is: vscale x 2.
+; CHECK-DBG: LV: User VF=vscale x 4 is unsafe, clamping to max safe VF=vscale x 2.
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2
+; CHECK-DBG: LV: Using max VF vscale x 2.
+; CHECK-LABEL: @test3
+; CHECK: <vscale x 2 x i32>
+define void @test3(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 32
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !8
+
+exit:
+  ret void
+}
+
+!8 = !{!8, !9, !10}
+!9 = !{!"llvm.loop.vectorize.enable", i1 true}
+!10 = !{!"llvm.loop.vectorize.width", !11}
+!11 = !{i32 4, i1 true}
+
+; test4
+;
+; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
+; words per 128-bits (packed).
+;
+; void test4(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 128] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=128, Max scalable VF=8, safe to vectorize.
+
+; CHECK-DBG-LABEL: LV: Checking a loop in "test4"
+; CHECK-DBG: LV: The max safe VF is: vscale x 8.
+; CHECK-DBG: LV: Using user VF vscale x 4
+; CHECK-LABEL: @test4
+; CHECK: <vscale x 4 x i32>
+define void @test4(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 128
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !12
+
+exit:
+  ret void
+}
+
+!12 = !{!12, !13, !14}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
+!14 = !{!"llvm.loop.vectorize.width", !15}
+!15 = !{i32 4, i1 true}
+
+; test5
+;
+; Specifies a vector of <vscale x 16 x i32>, i.e. maximum of 256 x i32.
+;
+; void test5(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(16, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 128] = a[i] + b[i];
+;   }
+; }
+;
+; Max fixed VF=128, Max scalable VF=8, user VF=16 is unsafe. Should clamp to 8.
+
+; CHECK-DBG-LABEL: LV: Checking a loop in "test5"
+; CHECK-DBG: LV: The max safe VF is: vscale x 8.
+; CHECK-DBG: LV: User VF=vscale x 16 is unsafe, clamping to max safe VF=vscale x 8.
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8
+; CHECK-DBG: LV: Using max VF vscale x 8
+; CHECK-LABEL: @test5
+; CHECK: <vscale x 8 x i32>
+define void @test5(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 128
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !16
+
+exit:
+  ret void
+}
+
+!16 = !{!16, !17, !18}
+!17 = !{!"llvm.loop.vectorize.enable", i1 true}
+!18 = !{!"llvm.loop.vectorize.width", !19}
+!19 = !{i32 16, i1 true}
Index: llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
@@ -0,0 +1,52 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; test1
+;
+; The pragma applied to this loop requests that a scalable vector
+; <vscale x 4 x i32> be used for vectorization. For fixed vectors MaxVF=4, as
+; vectors wider than 128 bits would create a dependence between vector lanes.
+;
+; void test1(int *a, int *b, int N) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4, scalable)
+;   for (int i=0; i<N; ++i) {
+;     a[i + 4] = a[i] + b[i];
+;   }
+; }
+;
+; For scalable vectorization 'vscale' has to be considered: in this example
+; vectorization is only safe when vscale=1. The default max vscale is 1 unless
+; a target specifies otherwise, so it is safe to vectorize this loop.
+
+; CHECK-LABEL: LV: Checking a loop in "test1"
+; CHECK: The max safe VF is: vscale x 4.
+; CHECK: Using user VF vscale x 4.
+; CHECK: <vscale x 4 x i32>
+define void @test1(i32* %a, i32* %b) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
+!2 = !{!"llvm.loop.vectorize.width", !3}
+!3 = !{i32 4, i1 true}