diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5783,9 +5783,29 @@
     return ElementCount::getFixed(ConstTripCount);
   }
 
+  LLVMContext &Context = TheLoop->getHeader()->getContext();
+  // The largest type limits the vectorization factor, but this can be too
+  // limiting when smaller memory operations are present, which are not
+  // legal/profitable with the chosen vectorization factor and are only
+  // profitable with larger vectorization factors.
+  //
+  // Try to detect such cases and increase the VF accordingly.
+  bool NarrowMemOpUnprofitable = false;
+  if (SmallestType <= 32 && SmallestType < WidestType &&
+      !MaxVectorSize.isScalable()) {
+    Type *SmallVT = FixedVectorType::get(
+        IntegerType::get(Context, SmallestType), MaxVectorSize.getFixedValue());
+    Type *SmallMaxPossibleVT =
+        FixedVectorType::get(IntegerType::get(Context, SmallestType),
+                             PowerOf2Floor(WidestRegister / SmallestType));
+    NarrowMemOpUnprofitable =
+        TTI.getMemoryOpCost(Instruction::Load, SmallVT, Align(1), 0) >
+        TTI.getMemoryOpCost(Instruction::Load, SmallMaxPossibleVT, Align(1), 0);
+  }
   ElementCount MaxVF = MaxVectorSize;
   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
-      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
+      ((MaximizeBandwidth || NarrowMemOpUnprofitable) &&
+       isScalarEpilogueAllowed())) {
     // Collect all viable vectorization factors larger than the default MaxVF
     // (i.e. MaxVectorSize).
     SmallVector<ElementCount, 8> VFs;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
@@ -9,7 +9,8 @@
 ; i8 memory accesses become profitable.
 define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
 ; CHECK-LABEL: @test_load_i8_store_i32(
-; CHECK-NOT: x i8>
+; CHECK: <16 x i8>
+; CHECK: <16 x i32>
 ;
 entry:
   br label %loop
@@ -33,7 +34,8 @@
 ; Same as test_load_i8_store_i32, but with types flipped for load and store.
 define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
 ; CHECK-LABEL: @test_load_i32_store_i8(
-; CHECK: <4 x i8>
+; CHECK: <16 x i32>
+; CHECK: <16 x i8>
 ;
 entry:
   br label %loop
@@ -85,7 +87,8 @@
 ; vectorization factor.
 define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64 %off, i64 %off.2, i64 %N) {
 ; CHECK-LABEL: @test_load_i8_store_i64_large
-; CHECK: <2 x i64>
+; CHECK: <8 x i8>
+; CHECK: <8 x i64>
 ;
 entry:
   br label %loop
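
For context (not part of the patch): a minimal sketch of the loop shape the new heuristic targets. The function name @narrow_and_wide and the exact IR are illustrative assumptions, reduced from the same pattern as @test_load_i8_store_i32 above. With 128-bit NEON registers, the widest type in the loop (i32) caps the default MaxVectorSize at VF 4, so the i8 load would be emitted as a <4 x i8> access; if TTI reports that load as costlier than the widest possible <16 x i8> load, NarrowMemOpUnprofitable fires and VF 16 is also considered, matching the updated <16 x i8>/<16 x i32> CHECK lines.

```llvm
; Illustrative only (assumed name/IR): i8 elements loaded, widened, and
; stored as i32. SmallestType = 8, WidestType = 32, so the heuristic's
; guard (SmallestType <= 32 && SmallestType < WidestType) is satisfied.
define void @narrow_and_wide(i8* noalias %src, i32* noalias %dst, i64 %N) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr inbounds i8, i8* %src, i64 %iv
  %l = load i8, i8* %src.gep           ; <4 x i8> at VF 4, <16 x i8> at VF 16
  %l.ext = zext i8 %l to i32
  %dst.gep = getelementptr inbounds i32, i32* %dst, i64 %iv
  store i32 %l.ext, i32* %dst.gep      ; widest type caps the default VF at 4
  %iv.next = add nuw i64 %iv, 1
  %ec = icmp eq i64 %iv.next, %N
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}
```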