diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5540,13 +5540,34 @@
   }
 
   ElementCount MaxVF = MaxVectorElementCount;
+  // The widest type limits the vectorization factor, but that limit can be
+  // too restrictive when the loop also contains narrower memory operations
+  // that are not legal/profitable at the chosen vectorization factor and only
+  // become profitable at larger vectorization factors.
+  //
+  // Try to detect such cases and, when found, also consider larger VFs.
+  LLVMContext &Context = TheLoop->getHeader()->getContext();
+  bool NarrowMemOpUnprofitable = false;
+  if (SmallestType < WidestType) {
+    Type *SmallVT =
+        VectorType::get(IntegerType::get(Context, SmallestType), MaxVF);
+    // Power-of-2 floor of the SmallestType-bit lanes in one WidestRegister.
+    unsigned MaxVFForSmallTy = PowerOf2Floor(
+        WidestRegister.divideCoefficientBy(SmallestType).getKnownMinValue());
+    Type *SmallMaxPossibleVT =
+        VectorType::get(IntegerType::get(Context, SmallestType),
+                        MaxVFForSmallTy, MaxVF.isScalable());
+    NarrowMemOpUnprofitable =
+        TTI.getMemoryOpCost(Instruction::Load, SmallVT, Align(1), 0) >
+        TTI.getMemoryOpCost(Instruction::Load, SmallMaxPossibleVT, Align(1), 0);
+  }
   if (TTI.shouldMaximizeVectorBandwidth() ||
-      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
+      ((MaximizeBandwidth || NarrowMemOpUnprofitable) &&
+       isScalarEpilogueAllowed())) {
     auto MaxVectorElementCountMaxBW = ElementCount::get(
         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
         ComputeScalableMaxVF);
     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
-
     // Collect all viable vectorization factors larger than the default MaxVF
     // (i.e. MaxVectorElementCount).
     SmallVector<ElementCount, 8> VFs;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
@@ -8,7 +8,8 @@
 ; load 4 x i8, vectorization might still be profitable.
 define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
 ; CHECK-LABEL: @test_load_i8_store_i32(
-; CHECK:       <4 x i8>
+; CHECK: <16 x i8>
+; CHECK: <16 x i32>
 ;
 entry:
   br label %loop
@@ -32,7 +33,8 @@
 ; Same as test_load_i8_store_i32, but with types flipped for load and store.
 define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
 ; CHECK-LABEL: @test_load_i32_store_i8(
-; CHECK:     <4 x i8>
+; CHECK:     <16 x i32>
+; CHECK:     <16 x i8>
 ;
 entry:
   br label %loop
@@ -84,7 +86,8 @@
 ; vectorization factor.
 define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) {
 ; CHECK-LABEL: @test_load_i8_store_i64_large
-; CHECK: <2 x i64>
+; CHECK: <8 x i8>
+; CHECK: <8 x i64>
 ;
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll
@@ -28,8 +28,8 @@
 ; NEOVERSE-N2: LV: Vector loop of width vscale x 2 costs: 6 (assuming a minimum vscale of 1).
 ; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
 
-; VF-4: <4 x i32>
-; VF-VSCALE4: <vscale x 4 x i32>
+; VF-4: <16 x i32>
+; VF-VSCALE4: <16 x i32>
 define void @test0(i32* %a, i8* %b, i32* %c) #0 {
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll
@@ -9,9 +9,9 @@
 define void @test0(i32* %a, i8* %b, i32* %c) #0 {
 ; CHECK: LV: Checking a loop in "test0"
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 16
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16
 ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: vscale x 16
 entry:
@@ -40,9 +40,9 @@
 define void @test1(i32* %a, i8* %b) #0 {
 ; CHECK: LV: Checking a loop in "test1"
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 16
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4
 ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
 entry:
@@ -72,9 +72,9 @@
 define void @test2(i32* %a, i8* %b) #0 {
 ; CHECK: LV: Checking a loop in "test2"
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
-; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 2
+; CHECK_SCALABLE_ON: LV: Selecting VF: 16
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2
 ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
 entry:
@@ -104,9 +104,9 @@
 define void @test3(i32* %a, i8* %b) #0 {
 ; CHECK: LV: Checking a loop in "test3"
 ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1
-; CHECK_SCALABLE_ON: LV: Selecting VF: 4
+; CHECK_SCALABLE_ON: LV: Selecting VF: 16
 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
-; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1
 ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
 entry: