Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -135,6 +135,12 @@
     return ST->getVScaleForTuning();
   }
 
+  bool shouldMaximizeVectorBandwidth() const {
+    if (ST->hasSVE())
+      return false;
+    return true;
+  }
+
   /// Try to return an estimate cost factor that can be used as a multiplier
   /// when scalarizing an operation for a vector with ElementCount \p VF.
   /// For scalable vectors this currently takes the most pessimistic view based
Index: llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
@@ -4,11 +4,12 @@
 ; are not profitable.
 
 ; Test with a loop that contains memory accesses of i8 and i32 types. The
-; default maximum VF for NEON is 4. And while we don't have an instruction to
-; load 4 x i8, vectorization might still be profitable.
+; maximum VF for NEON is calculated as 128 / (size in bits of the smallest type
+; in the loop). And while we don't have an instruction to load 4 x i8,
+; vectorization might still be profitable.
 define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
 ; CHECK-LABEL: @test_load_i8_store_i32(
-; CHECK: <4 x i8>
+; CHECK: <16 x i8>
 ;
 entry:
   br label %loop
@@ -32,7 +33,7 @@
 
 ; Same as test_load_i8_store_i32, but with types flipped for load and store.
 define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
 ; CHECK-LABEL: @test_load_i32_store_i8(
-; CHECK: <4 x i8>
+; CHECK: <16 x i8>
 ;
 entry:
   br label %loop
@@ -84,7 +85,7 @@
 ; vectorization factor.
 define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) {
 ; CHECK-LABEL: @test_load_i8_store_i64_large
-; CHECK: <2 x i64>
+; CHECK: <8 x i64>
 ;
 entry:
   br label %loop
Index: llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -116,9 +116,9 @@
 }
 
 ; CHECK-LABEL: @add_d(
-; CHECK: load <4 x i16>
-; CHECK: add nsw <4 x i32>
-; CHECK: store <4 x i32>
+; CHECK: load <8 x i16>
+; CHECK: add nsw <8 x i32>
+; CHECK: store <8 x i32>
 define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
 entry:
   %cmp7 = icmp sgt i32 %len, 0
Index: llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
@@ -123,16 +123,16 @@
 ; }
 ;
 ; CHECK: vector.body:
-; CHECK: phi <8 x i16>
-; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16>
-; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16>
-; CHECK: add <8 x i16>
-; CHECK: add <8 x i16>
+; CHECK: phi <16 x i16>
+; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16>
+; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16>
+; CHECK: add <16 x i16>
+; CHECK: add <16 x i16>
 ;
 ; CHECK: middle.block:
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16>
 ; CHECK: zext i16 [[Rdx]] to i32
 ;
 define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
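
The updated CHECK lines all follow from the rule quoted in the test comment above: when bandwidth is maximized, the fixed-width VF for NEON is the 128-bit register width divided by the size of the smallest element type in the loop. Below is a minimal sketch of that arithmetic; it is illustrative only, not the LoopVectorize implementation, and the function name and parameters are hypothetical.

    #include <cassert>
    #include <cstdio>

    // Illustrative sketch: fixed-width max VF when maximizing bandwidth,
    // assuming a 128-bit NEON register and the smallest element type present
    // in the loop. Name and signature are hypothetical.
    static unsigned maxVFForSmallestType(unsigned RegBits, unsigned SmallestTypeBits) {
      assert(SmallestTypeBits != 0 && RegBits % SmallestTypeBits == 0);
      return RegBits / SmallestTypeBits;
    }

    int main() {
      // i8 in the loop:  128 / 8  = 16 -> the <16 x i8> checks above.
      // i16 in the loop: 128 / 16 = 8  -> the <8 x i16>/<8 x i32> checks in @add_d.
      std::printf("%u %u\n", maxVFForSmallestType(128, 8), maxVFForSmallestType(128, 16));
      return 0;
    }

Wider types in the same loop are then widened to match that VF (hence <16 x i16> in @reduction_i16_2), and the cost model may still settle on a smaller VF, as the <8 x i64> check in @test_load_i8_store_i64_large suggests.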