Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -171,6 +171,10 @@
                                  bool IsPairwiseForm);
 
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+
+  bool shouldMaximizeVectorBandwidth(bool OptSize) const {
+    return !OptSize;
+  }
 
   /// @}
 };
Index: test/Transforms/LoopVectorize/AArch64/aarch64-trunc-vec.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/AArch64/aarch64-trunc-vec.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -loop-vectorize -mtriple=aarch64-none-linux-gnu -mattr=+neon -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; Original source code:
+;
+; void store_i32_to_i8 (const int *src, int width, unsigned char *dst)
+; {
+;   for (int i = 0; i < width; i++) {
+;     *dst++ = *src++;
+;   }
+;}
+
+; Function Attrs: norecurse nounwind
+define void @store_i32_to_i8(i32* noalias nocapture readonly %src, i32 %width, i8* nocapture %dst) {
+; CHECK-LABEL: store_i32_to_i8
+; CHECK-LABEL: vector.body:
+; CHECK: [[WIDE_LOAD0:%.*]] = load <16 x i32>, <16 x i32>* {{%.*}}, align 4
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* {{%.*}}, i32 16
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>*
+; CHECK: [[WIDE_LOAD1:%.*]] = load <16 x i32>, <16 x i32>* {{%.*}}, align 4
+; CHECK-NEXT: [[TRUNC1:%.*]] = trunc <16 x i32> [[WIDE_LOAD0]] to <16 x i8>
+; CHECK-NEXT: [[TRUNC2:%.*]] = trunc <16 x i32> [[WIDE_LOAD1]] to <16 x i8>
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, i8* {{%.*}}, i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <16 x i8>*
+; CHECK-NEXT: store <16 x i8> [[TRUNC1]], <16 x i8>* [[TMP17]], align 1
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* {{%.*}}, i32 16
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <16 x i8>*
+; CHECK-NEXT: store <16 x i8> [[TRUNC2]], <16 x i8>* [[TMP19]], align 1
+entry:
+  %cmp4 = icmp sgt i32 %width, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %dst.addr.06 = phi i8* [ %incdec.ptr1, %for.body ], [ %dst, %for.body.preheader ]
+  %src.addr.05 = phi i32* [ %incdec.ptr, %for.body ], [ %src, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %src.addr.05, i64 1
+  %0 = load i32, i32* %src.addr.05, align 4
+  %conv = trunc i32 %0 to i8
+  %incdec.ptr1 = getelementptr inbounds i8, i8* %dst.addr.06, i64 1
+  store i8 %conv, i8* %dst.addr.06, align 1
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %width
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
Index: test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
===================================================================
--- test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -88,9 +88,9 @@
 }
 
 ; CHECK-LABEL: @add_c(
-; CHECK: load <8 x i8>, <8 x i8>*
-; CHECK: add <8 x i16>
-; CHECK: store <8 x i16>
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: add <16 x i16>
+; CHECK: store <16 x i16>
 ; Function Attrs: nounwind
 define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
 entry:
@@ -116,9 +116,9 @@
 }
 
 ; CHECK-LABEL: @add_d(
-; CHECK: load <4 x i16>
-; CHECK: add nsw <4 x i32>
-; CHECK: store <4 x i32>
+; CHECK: load <8 x i16>
+; CHECK: add nsw <8 x i32>
+; CHECK: store <8 x i32>
 define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
 entry:
   %cmp7 = icmp sgt i32 %len, 0
@@ -187,16 +187,16 @@
 }
 
 ; CHECK-LABEL: @add_f
-; CHECK: load <8 x i16>
-; CHECK: trunc <8 x i16>
-; CHECK: shl <8 x i8>
-; CHECK: add <8 x i8>
-; CHECK: or <8 x i8>
-; CHECK: mul <8 x i8>
-; CHECK: and <8 x i8>
-; CHECK: xor <8 x i8>
-; CHECK: mul <8 x i8>
-; CHECK: store <8 x i8>
+; CHECK: load <16 x i16>
+; CHECK: trunc <16 x i16>
+; CHECK: shl <16 x i8>
+; CHECK: add <16 x i8>
+; CHECK: or <16 x i8>
+; CHECK: mul <16 x i8>
+; CHECK: and <16 x i8>
+; CHECK: xor <16 x i8>
+; CHECK: mul <16 x i8>
+; CHECK: store <16 x i8>
 define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
 entry:
   %cmp.32 = icmp sgt i32 %len, 0
Index: test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
===================================================================
--- test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
+++ test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
@@ -123,16 +123,16 @@
 ; }
 ;
 ; CHECK: vector.body:
-; CHECK: phi <8 x i16>
-; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16>
-; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16>
-; CHECK: add <8 x i16>
-; CHECK: add <8 x i16>
+; CHECK: phi <16 x i16>
+; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16>
+; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16>
+; CHECK: add <16 x i16>
+; CHECK: add <16 x i16>
 ;
 ; CHECK: middle.block:
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>
 ; CHECK: zext i16 [[Rdx]] to i32
 ;
 define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {