Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1608,14 +1608,26 @@
   /// \return UserVF directly if it is valid. Otherwise clamp UserVF to the
   /// largest valid value.
   Optional<ElementCount> getFeasibleUserVF(ElementCount UserVF,
-                                           unsigned MaxSafeElements);
+                                           ElementCount MaxSafeElements);
 
   /// \return An upper bound for the vectorization factor, a power-of-2 larger
-  /// than zero. One is returned if vectorization should best be avoided due
-  /// to cost.
+  /// than zero, limited by \p MaxSafeVF. If \p MaxSafeVF is scalable, the
+  /// computed feasible max VF will be scalable as well. One (scalar) is
+  /// returned if vectorization should best be avoided due to cost.
   ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
+                                    ElementCount MaxSafeVF,
                                     unsigned SmallestType, unsigned WidestType);
 
+  /// \return the \p SuggestedVF if it is less than or equal to \p MaxSafeVF,
+  /// otherwise the value is clamped to MaxSafeVF. If \p SuggestedVF is
+  /// scalable, and \p MaxSafeVF is not, then it uses MaxVScale to determine
+  /// whether it can use a smaller scalable VF. Otherwise it clamps to a
+  /// fixed-width VF. If \p GetMaxSafeVF is not None, then the calculated
+  /// maximum safe VF will be written to the passed address.
+  ElementCount clampFeasibleMaxVF(ElementCount SuggestedVF,
+                                  ElementCount MaxSafeVF,
+                                  Optional<ElementCount *> GetMaxSafeVF = None);
+
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
   /// operate on
@@ -5563,13 +5575,17 @@
     // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
     // the memory accesses that is most restrictive (involved in the smallest
     // dependence distance).
-    unsigned MaxSafeElements =
-        PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+    auto MaxSafeElements = ElementCount::getFixed(
+        PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType));
 
     // First analyze the UserVF, fall back if the UserVF should be ignored.
     if (auto MaybeMaxVF = getFeasibleUserVF(UserVF, MaxSafeElements))
       return MaybeMaxVF.getValue();
-    return computeFeasibleMaxVF(TC, SmallestType, WidestType);
+
+    // Try to automatically determine a suitable maximum VF.
+    auto MaxSafeVF =
+        clampFeasibleMaxVF(ElementCount::getFixed(1 << 16), MaxSafeElements);
+    return computeFeasibleMaxVF(TC, MaxSafeVF, SmallestType, WidestType);
   };
 
   switch (ScalarEpilogueStatus) {
@@ -5691,9 +5707,35 @@
   return None;
 }
 
+ElementCount LoopVectorizationCostModel::clampFeasibleMaxVF(
+    ElementCount SuggestedVF, ElementCount ClampValue,
+    Optional<ElementCount *> OutputMaxValidVF) {
+  assert((SuggestedVF.isScalable() || !ClampValue.isScalable()) &&
+         "Cannot clamp a fixed-width VF to a scalable VF");
+
+  ElementCount MaxVF = ClampValue;
+  if (SuggestedVF.isScalable() && !ClampValue.isScalable()) {
+    Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+
+    // Convert the fixed bound to a scalable one by dividing by max vscale.
+    MaxVF = ElementCount::getScalable(
+        MaxVScale ? (ClampValue.getFixedValue() / MaxVScale.getValue()) : 0);
+
+    // No scalable VF fits (or max vscale is unknown); use a fixed-width VF.
+    if (MaxVF.isZero())
+      return clampFeasibleMaxVF(
+          ElementCount::getFixed(SuggestedVF.getKnownMinValue()), ClampValue,
+          OutputMaxValidVF);
+  }
+
+  if (OutputMaxValidVF)
+    **OutputMaxValidVF = MaxVF;
+  return ElementCount::isKnownLE(SuggestedVF, MaxVF) ? SuggestedVF : MaxVF;
+}
+
 Optional<ElementCount>
 LoopVectorizationCostModel::getFeasibleUserVF(ElementCount UserVF,
-                                              unsigned MaxSafeElements) {
+                                              ElementCount MaxSafeElements) {
   if (UserVF.isZero())
     return None;
 
@@ -5725,97 +5767,86 @@
   if (Legal->isSafeForAnyVectorWidth())
     return UserVF;
 
-  ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
-
-  if (UserVF.isScalable()) {
-    Optional<unsigned> MaxVScale = TTI.getMaxVScale();
-
-    // Scale VF by vscale before checking if it's safe.
-    MaxSafeVF = ElementCount::getScalable(
-        MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
-
-    if (MaxSafeVF.isZero()) {
-      // The dependence distance is too small to use scalable vectors,
-      // fallback on fixed.
-      LLVM_DEBUG(
-          dbgs()
-          << "LV: Max legal vector width too small, scalable vectorization "
-             "unfeasible. Using fixed-width vectorization instead.\n");
-      ORE->emit([&]() {
-        return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
-                                          TheLoop->getStartLoc(),
-                                          TheLoop->getHeader())
-               << "Max legal vector width too small, scalable vectorization "
-               << "unfeasible. Using fixed-width vectorization instead.";
-      });
-      return getFeasibleUserVF(
-          ElementCount::getFixed(UserVF.getKnownMinValue()), MaxSafeElements);
-    }
-  }
+  // If the user vectorization factor is legally unsafe, clamp it to a safe
+  // value. Otherwise, return as is.
+  ElementCount MaxSafeVF;
+  ElementCount NewVF = clampFeasibleMaxVF(UserVF, MaxSafeElements, &MaxSafeVF);
 
+  // Emit some useful debug output / opt remarks if the user value is clamped.
   LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
-  if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
-    return UserVF;
-
-  LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
-                    << " is unsafe, clamping to max safe VF=" << MaxSafeVF
-                    << ".\n");
-  ORE->emit([&]() {
-    return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
-                                      TheLoop->getStartLoc(),
-                                      TheLoop->getHeader())
-           << "User-specified vectorization factor "
-           << ore::NV("UserVectorizationFactor", UserVF)
-           << " is unsafe, clamping to maximum safe vectorization factor "
-           << ore::NV("VectorizationFactor", MaxSafeVF);
-  });
-  return MaxSafeVF;
+  if (UserVF.isScalable() != NewVF.isScalable()) {
+    auto Diag = "Max legal vector width too small, scalable vectorization "
+                "unfeasible. Using fixed-width vectorization instead.";
+    LLVM_DEBUG(dbgs() << "LV: " << Diag << "\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
+                                        TheLoop->getStartLoc(),
+                                        TheLoop->getHeader())
+             << Diag;
+    });
+  }
+  if (NewVF.getKnownMinValue() != UserVF.getKnownMinValue()) {
+    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+                      << " is unsafe, clamping to max safe VF=" << MaxSafeVF
+                      << ".\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+                                        TheLoop->getStartLoc(),
+                                        TheLoop->getHeader())
+             << "User-specified vectorization factor "
+             << ore::NV("UserVectorizationFactor", UserVF)
+             << " is unsafe, clamping to maximum safe vectorization factor "
+             << ore::NV("VectorizationFactor", MaxSafeVF);
+    });
+  }
+  return NewVF;
 }
 
 ElementCount LoopVectorizationCostModel::computeFeasibleMaxVF(
-    unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType) {
-  // Get the maximum safe dependence distance in bits computed by LAA.
-  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
-  // the memory accesses that is most restrictive (involved in the smallest
-  // dependence distance).
-  unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
-
+    unsigned ConstTripCount, ElementCount MaxSafeVF, unsigned SmallestType,
+    unsigned WidestType) {
   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
-  WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
+  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
+                    << " / " << WidestType << " bits.\n");
+  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
+                    << WidestRegister << " bits.\n");
 
   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
   // Note that both WidestRegister and WidestType may not be a powers of 2.
   auto MaxVectorSize =
       ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType));
 
-  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
-                    << " / " << WidestType << " bits.\n");
-  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
-                    << WidestRegister << " bits.\n");
-
-  assert(MaxVectorSize.getFixedValue() <= WidestRegister &&
-         "Did not expect to pack so many elements"
-         " into one vector!");
-  if (MaxVectorSize.getFixedValue() == 0) {
+  if (MaxVectorSize.getKnownMinValue() == 0) {
     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
     return ElementCount::getFixed(1);
-  } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() &&
-             isPowerOf2_32(ConstTripCount)) {
+  }
+
+  if (!Legal->isSafeForAnyVectorWidth())
+    MaxVectorSize = clampFeasibleMaxVF(MaxVectorSize, MaxSafeVF);
+
+  if (ConstTripCount && isPowerOf2_32(ConstTripCount)) {
     // We need to clamp the VF to be the ConstTripCount. There is no point in
     // choosing a higher viable VF as done in the loop below.
-    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
-                      << ConstTripCount << "\n");
-    return ElementCount::getFixed(ConstTripCount);
+    ElementCount ClampedVF = clampFeasibleMaxVF(
+        MaxVectorSize, ElementCount::getFixed(ConstTripCount));
+    if (ClampedVF != MaxVectorSize) {
+      LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
+                        << ConstTripCount << "\n");
+      return ClampedVF;
+    }
   }
 
   ElementCount MaxVF = MaxVectorSize;
   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
+    auto MaxVectorSizeMaxBW =
+        ElementCount::getFixed(PowerOf2Floor(WidestRegister / SmallestType));
+    if (!Legal->isSafeForAnyVectorWidth())
+      MaxVectorSizeMaxBW = clampFeasibleMaxVF(MaxVectorSizeMaxBW, MaxSafeVF);
+
     // Collect all viable vectorization factors larger than the default MaxVF
     // (i.e. MaxVectorSize).
     SmallVector<ElementCount, 8> VFs;
-    auto MaxVectorSizeMaxBW =
-        ElementCount::getFixed(WidestRegister / SmallestType);
     for (ElementCount VS = MaxVectorSize * 2;
          ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2)
       VFs.push_back(VS);
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -37,9 +37,9 @@
 ; unless max(vscale)=2 it's unsafe to vectorize. For SVE max(vscale)=16, check
 ; fixed-width vectorization is used instead.
 
+; CHECK-DBG: LV: The max safe VF is: 8.
 ; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
 ; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
-; CHECK-DBG: LV: The max safe VF is: 8.
 ; CHECK-DBG: LV: Selecting VF: 4.
 ; CHECK-LABEL: @test1
 ; CHECK: <4 x i32>
@@ -80,9 +80,9 @@
 ;   }
 ; }
 
-; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
 ; CHECK-DBG: LV: The max safe VF is: 4.
-; CHECK-DBG: LV: User VF=8 is unsafe, clamping to max safe VF=4.
+; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: LV: User VF=vscale x 8 is unsafe, clamping to max safe VF=4.
 ; CHECK-DBG: LV: Selecting VF: 4.
 ; CHECK-LABEL: @test2
 ; CHECK: <4 x i32>
@@ -337,8 +337,8 @@
 ; supported but max vscale is undefined.
 ;
 ; CHECK-NO-MAX-VSCALE-LABEL: LV: Checking a loop in "test_no_max_vscale"
-; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
 ; CEHCK-NO-MAX-VSCALE: The max safe VF is: 4.
+; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
 ; CHECK-NO-MAX-VSCALE: LV: Selecting VF: 4.
 ; CHECK-NO-MAX-VSCALE: <4 x i32>
 define void @test_no_max_vscale(i32* %a, i32* %b) {