Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -925,6 +925,11 @@
   ///  architectural maximum vector length, and None otherwise.
   Optional<unsigned> getMaxVScale() const;
 
+  /// \return The maximum number of bits in one block of a scalable vector
+  /// register; a scalable vector register holds vscale such blocks,
+  /// i.e. the maximum size in bits of N x elt in <vscale x N x elt>.
+  unsigned getMaxScalableBitsPerBlock() const;
+
   /// \return True if the vectorization factor should be chosen to
   /// make the vector of the smallest element type match the size of a
   /// vector register. For wider element types, this could result in
@@ -1513,6 +1518,7 @@
   virtual unsigned getRegisterBitWidth(bool Vector) const = 0;
   virtual unsigned getMinVectorRegisterBitWidth() = 0;
   virtual Optional<unsigned> getMaxVScale() const = 0;
+  virtual unsigned getMaxScalableBitsPerBlock() const = 0;
   virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0;
   virtual ElementCount getMinimumVF(unsigned ElemWidth,
                                     bool IsScalable) const = 0;
@@ -1941,6 +1947,9 @@
   Optional<unsigned> getMaxVScale() const override {
     return Impl.getMaxVScale();
   }
+  unsigned getMaxScalableBitsPerBlock() const override {
+    return Impl.getMaxScalableBitsPerBlock();
+  }
   bool shouldMaximizeVectorBandwidth(bool OptSize) const override {
     return Impl.shouldMaximizeVectorBandwidth(OptSize);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -376,6 +376,8 @@
 
   Optional<unsigned> getMaxVScale() const { return None; }
 
+  unsigned getMaxScalableBitsPerBlock() const { return 0; }
+
   bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; }
 
   ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const {
Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -109,7 +109,7 @@
   void emitRemarkWithHints() const;
 
   ElementCount getWidth() const {
-    return ElementCount::get(Width.Value, isScalable());
+    return ElementCount::get(Width.Value, isForcedScalable());
   }
   unsigned getInterleave() const { return Interleave.Value; }
   unsigned getIsVectorized() const { return IsVectorized.Value; }
@@ -121,7 +121,13 @@
     return (ForceKind)Force.Value;
   }
 
-  bool isScalable() const { return Scalable.Value; }
+  bool isForcedScalable() const {
+    return Scalable.Value == LoopVectorizeHints::FK_Enabled;
+  }
+
+  bool allowScalable() const {
+    return Scalable.Value != LoopVectorizeHints::FK_Disabled;
+  }
 
   /// If hints are provided that force vectorization, use the AlwaysPrint
   /// pass name to force the frontend to print the diagnostic.
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -585,6 +585,10 @@
   return TTIImpl->getMaxVScale();
 }
 
+unsigned TargetTransformInfo::getMaxScalableBitsPerBlock() const {
+  return TTIImpl->getMaxScalableBitsPerBlock();
+}
+
 bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const {
   return TTIImpl->shouldMaximizeVectorBandwidth(OptSize);
 }
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -121,6 +121,12 @@
     return BaseT::getMaxVScale();
   }
 
+  unsigned getMaxScalableBitsPerBlock() const {
+    if (ST->hasSVE())
+      return AArch64::SVEBitsPerBlock;
+    return BaseT::getMaxScalableBitsPerBlock();
+  }
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -51,6 +51,21 @@
     cl::desc("The maximum number of SCEV checks allowed with a "
              "vectorize(enable) pragma"));
 
+// FIXME: When scalable vectorization is stable enough, change the default
+// to FK_Undefined.
+static cl::opt<LoopVectorizeHints::ForceKind> ScalableVectorization(
+    "scalable-vectorization", cl::init(LoopVectorizeHints::FK_Disabled),
+    cl::Hidden,
+    cl::desc("Control whether the compiler can use scalable vectors to "
+             "vectorize a loop"),
+    cl::values(
+        clEnumValN(LoopVectorizeHints::FK_Disabled, "off",
+                   "disable all vectorization with scalable vectors"),
+        clEnumValN(LoopVectorizeHints::FK_Undefined, "on",
+                   "allow loops to be vectorized with scalable vectors"),
+        clEnumValN(LoopVectorizeHints::FK_Enabled, "always",
+                   "allow loops to be vectorized exclusively with scalable vectors")));
+
 /// Maximum vectorization interleave count.
 static const unsigned MaxInterleaveFactor = 16;
 
@@ -63,10 +78,10 @@
   case HK_UNROLL:
     return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
   case HK_FORCE:
+  case HK_SCALABLE:
     return (Val <= 1);
   case HK_ISVECTORIZED:
   case HK_PREDICATE:
-  case HK_SCALABLE:
     return (Val == 0 || Val == 1);
   }
   return false;
@@ -80,8 +95,8 @@
       Force("vectorize.enable", FK_Undefined, HK_FORCE),
       IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
       Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
-      Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L),
-      ORE(ORE) {
+      Scalable("vectorize.scalable.enable", ScalableVectorization, HK_SCALABLE),
+      TheLoop(L), ORE(ORE) {
   // Populate values with existing loop metadata.
   getHintsFromMetadata();
 
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5578,6 +5578,21 @@
     auto MaxSafeElements = ElementCount::getFixed(
         PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType));
 
+    // Try to find a max scalable VF.
+    if (Hints->allowScalable() && TTI.supportsScalableVectors()) {
+      auto MaxSafeVF = clampFeasibleMaxVF(ElementCount::getScalable(1 << 16),
+                                          MaxSafeElements);
+      if (MaxSafeVF.isScalableVector()) {
+        ElementCount MaxScalableVF =
+            computeFeasibleMaxVF(TC, MaxSafeVF, SmallestType, WidestType);
+        if (MaxScalableVF.isScalable())
+          LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = "
+                            << MaxScalableVF << "\n");
+        else
+          LLVM_DEBUG(dbgs() << "LV: No feasible scalable VF found.\n");
+      }
+    }
+
     // First analyze the UserVF, fall back if the UserVF should be ignored.
     if (auto MaybeMaxVF = getFeasibleUserVF(UserVF, MaxSafeElements))
       return MaybeMaxVF.getValue();
@@ -5805,16 +5820,21 @@
 ElementCount LoopVectorizationCostModel::computeFeasibleMaxVF(
     unsigned ConstTripCount, ElementCount MaxSafeVF, unsigned SmallestType,
     unsigned WidestType) {
-  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
+  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
+  unsigned WidestRegister = ComputeScalableMaxVF
+                                ? TTI.getMaxScalableBitsPerBlock()
+                                : TTI.getRegisterBitWidth(true);
+
   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                     << " / " << WidestType << " bits.\n");
   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
+                    << (ComputeScalableMaxVF ? "vscale x " : "")
                     << WidestRegister << " bits.\n");
 
   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
   // Note that both WidestRegister and WidestType may not be a powers of 2.
-  auto MaxVectorSize =
-      ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType));
+  auto MaxVectorSize = ElementCount::get(
+      PowerOf2Floor(WidestRegister / WidestType), ComputeScalableMaxVF);
 
   if (MaxVectorSize.getKnownMinValue() == 0) {
     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
@@ -5824,6 +5844,14 @@
   if (!Legal->isSafeForAnyVectorWidth())
     MaxVectorSize = clampFeasibleMaxVF(MaxVectorSize, MaxSafeVF);
 
+  // Test that the loop-vectorizer can legalize all operations for this MaxVF.
+  // FIXME: While for scalable vectors this is currently sufficient, this should
+  // be replaced by a more detailed mechanism that filters out specific VFs,
+  // instead of invalidating vectorization for a whole set of VFs based on the
+  // MaxVF.
+  if (MaxVectorSize.isScalable() && !canVectorizeReductions(MaxVectorSize))
+    return ElementCount::getFixed(1);
+
   if (ConstTripCount && isPowerOf2_32(ConstTripCount)) {
     // We need to clamp the VF to be the ConstTripCount. There is no point in
     // choosing a higher viable VF as done in the loop below.
@@ -5839,8 +5867,8 @@
   ElementCount MaxVF = MaxVectorSize;
   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
-    auto MaxVectorSizeMaxBW =
-        ElementCount::getFixed(PowerOf2Floor(WidestRegister / SmallestType));
+    auto MaxVectorSizeMaxBW = ElementCount::get(
+        PowerOf2Floor(WidestRegister / SmallestType), ComputeScalableMaxVF);
     if (!Legal->isSafeForAnyVectorWidth())
       MaxVectorSizeMaxBW = clampFeasibleMaxVF(MaxVectorSizeMaxBW, MaxSafeVF);
 
@@ -5869,7 +5897,7 @@
       }
     }
     if (ElementCount MinVF =
-            TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) {
+            TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                           << ") with target's minimum: " << MinVF << '\n');
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll
@@ -0,0 +1,124 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=on -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_ON
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=always -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_ALWAYSON
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=off -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_DISABLED
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -scalable-vectorization=on -loop-vectorize -S -debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_MAXBW
+
+; Test that the MaxVF for the following loop, that has no dependence distances,
+; is calculated as vscale x 4 (max legal SVE vector size) or vscale x 16
+; (maximized bandwidth for i8 in the loop).
+define void @test0(i32* %a, i8* %b, i32* %c) {
+; CHECK: LV: Checking a loop in "test0"
+; CHECK_ON: LV: Found feasible scalable VF = vscale x 4
+; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 4
+; CHECK_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 16
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+  %1 = load i8, i8* %arrayidx2, align 4
+  %zext = zext i8 %1 to i32
+  %add = add nsw i32 %zext, %0
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+; Test that the MaxVF for the following loop, with a dependence distance
+; of 64 elements, is calculated as vscale x 4, since (maxvscale = 16) * 4 = 64.
+define void @test1(i32* %a, i8* %b) {
+; CHECK: LV: Checking a loop in "test1"
+; CHECK_ON: LV: Found feasible scalable VF = vscale x 4
+; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 4
+; CHECK_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 4
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+  %1 = load i8, i8* %arrayidx2, align 4
+  %zext = zext i8 %1 to i32
+  %add = add nsw i32 %zext, %0
+  %2 = add nuw nsw i64 %iv, 64
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Test that the MaxVF for the following loop, with a dependence distance
+; of 32 elements, is calculated as vscale x 2, since (maxvscale = 16) * 2 = 32.
+define void @test2(i32* %a, i8* %b) {
+; CHECK: LV: Checking a loop in "test2"
+; CHECK_ON: LV: Found feasible scalable VF = vscale x 2
+; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 2
+; CHECK_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 2
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+  %1 = load i8, i8* %arrayidx2, align 4
+  %zext = zext i8 %1 to i32
+  %add = add nsw i32 %zext, %0
+  %2 = add nuw nsw i64 %iv, 32
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Test that the MaxVF for the following loop, with a dependence distance
+; of 16 elements, is calculated as vscale x 1, since (maxvscale = 16) * 1 = 16.
+define void @test3(i32* %a, i8* %b) {
+; CHECK: LV: Checking a loop in "test3"
+; CHECK_ON: LV: Found feasible scalable VF = vscale x 1
+; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 1
+; CHECK_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 1
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+  %1 = load i8, i8* %arrayidx2, align 4
+  %zext = zext i8 %1 to i32
+  %add = add nsw i32 %zext, %0
+  %2 = add nuw nsw i64 %iv, 16
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}