Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -122,7 +122,7 @@ "value.")); static cl::opt<bool> MaximizeBandwidth( - "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, + "vectorizer-maximize-bandwidth", cl::init(true), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop.")); Index: test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -88,9 +88,9 @@ } ; CHECK-LABEL: @add_c( -; CHECK: load <8 x i8>, <8 x i8>* -; CHECK: add <8 x i16> -; CHECK: store <8 x i16> +; CHECK: load <16 x i8>, <16 x i8>* +; CHECK: add <16 x i16> +; CHECK: store <16 x i16> ; Function Attrs: nounwind define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 { entry: @@ -116,9 +116,9 @@ } ; CHECK-LABEL: @add_d( -; CHECK: load <4 x i16> -; CHECK: add nsw <4 x i32> -; CHECK: store <4 x i32> +; CHECK: load <8 x i16> +; CHECK: add nsw <8 x i32> +; CHECK: store <8 x i32> define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 { entry: %cmp7 = icmp sgt i32 %len, 0 @@ -187,16 +187,16 @@ } ; CHECK-LABEL: @add_f -; CHECK: load <8 x i16> -; CHECK: trunc <8 x i16> -; CHECK: shl <8 x i8> -; CHECK: add <8 x i8> -; CHECK: or <8 x i8> -; CHECK: mul <8 x i8> -; CHECK: and <8 x i8> -; CHECK: xor <8 x i8> -; CHECK: mul <8 x i8> -; CHECK: store <8 x i8> +; CHECK: load <16 x i16> +; CHECK: trunc <16 x i16> +; CHECK: shl <16 x i8> +; CHECK: add <16 x i8> +; CHECK: or <16 x i8> +; CHECK: mul <16 x i8> +; CHECK: and <16 x i8> +; CHECK: xor <16 x i8> +; CHECK: mul <16 x 
i8> +; CHECK: store <16 x i8> define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 { entry: %cmp.32 = icmp sgt i32 %len, 0 Index: test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll +++ test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll @@ -123,16 +123,16 @@ ; } ; ; CHECK: vector.body: -; CHECK: phi <8 x i16> -; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8> -; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16> -; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8> -; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16> -; CHECK: add <8 x i16> -; CHECK: add <8 x i16> +; CHECK: phi <16 x i16> +; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8> +; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16> +; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8> +; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16> +; CHECK: add <16 x i16> +; CHECK: add <16 x i16> ; ; CHECK: middle.block: -; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> +; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> ; CHECK: zext i16 [[Rdx]] to i32 ; define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) { Index: test/Transforms/LoopVectorize/ARM/gcc-examples.ll =================================================================== --- test/Transforms/LoopVectorize/ARM/gcc-examples.ll +++ test/Transforms/LoopVectorize/ARM/gcc-examples.ll @@ -35,9 +35,9 @@ } ;CHECK-LABEL: @example10b( -;CHECK: load <4 x i16> -;CHECK: sext <4 x i16> -;CHECK: store <4 x i32> +;CHECK: load <8 x i16> +;CHECK: sext <8 x i16> +;CHECK: store <8 x i32> ;CHECK: ret void define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* 
noalias nocapture %ic) nounwind uwtable ssp { br label %1 Index: test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll =================================================================== --- test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll +++ test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll @@ -9,7 +9,9 @@ ; If we need to scalarize the fptoui and then use inserts to build up the ; vector again, then there is certainly no value in going 256-bit wide. -; CHECK-NOT: vpinsrd +; But as we default to maximize bandwidth, we should convert it to 256-bit +; anyway. +; CHECK: vpinsrd define void @convert() { entry: Index: test/Transforms/LoopVectorize/X86/gcc-examples.ll =================================================================== --- test/Transforms/LoopVectorize/X86/gcc-examples.ll +++ test/Transforms/LoopVectorize/X86/gcc-examples.ll @@ -44,17 +44,16 @@ ret void } -; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive. ;CHECK-LABEL: @example10b( -;CHECK: load <4 x i16> -;CHECK: sext <4 x i16> -;CHECK: store <4 x i32> +;CHECK: load <8 x i16> +;CHECK: sext <8 x i16> +;CHECK: store <8 x i32> ;CHECK: ret void ;UNROLL-LABEL: @example10b( -;UNROLL: load <4 x i16> -;UNROLL: load <4 x i16> -;UNROLL: store <4 x i32> -;UNROLL: store <4 x i32> +;UNROLL: load <8 x i16> +;UNROLL: load <8 x i16> +;UNROLL: store <8 x i32> +;UNROLL: store <8 x i32> ;UNROLL: ret void define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { br label %1 Index: test/Transforms/LoopVectorize/X86/masked_load_store.ll =================================================================== --- test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -260,20 +260,28 @@ ; } ;} -;AVX-LABEL: @foo3 -;AVX: icmp slt <4 x i32> %wide.load, 
@llvm.masked.load.v4f64.p0v4f64 -;AVX: sitofp <4 x i32> %wide.load to <4 x double> -;AVX: fadd <4 x double> -;AVX: call void @llvm.masked.store.v4f64.p0v4f64 -;AVX: ret void +;AVX1-LABEL: @foo3 +;AVX1: icmp slt <4 x i32> %wide.load, @llvm.masked.load.v4f64.p0v4f64 +;AVX1: sitofp <4 x i32> %wide.load to <4 x double> +;AVX1: fadd <4 x double> +;AVX1: call void @llvm.masked.store.v4f64.p0v4f64 +;AVX1: ret void + +;AVX2-LABEL: @foo3 +;AVX2: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8f64.p0v8f64 +;AVX2: sitofp <8 x i32> %wide.load to <8 x double> +;AVX2: fadd <8 x double> +;AVX2: call void @llvm.masked.store.v8f64.p0v8f64 +;AVX2: ret void ;AVX512-LABEL: @foo3 -;AVX512: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8f64.p0v8f64 -;AVX512: sitofp <8 x i32> %wide.load to <8 x double> -;AVX512: fadd <8 x double> -;AVX512: call void @llvm.masked.store.v8f64.p0v8f64 +;AVX512: icmp slt <16 x i32> %wide.load, @llvm.masked.load.v16f64.p0v16f64 +;AVX512: sitofp <16 x i32> %wide.load to <16 x double> +;AVX512: fadd <16 x double> +;AVX512: call void @llvm.masked.store.v16f64.p0v16f64 ;AVX512: ret void @@ -502,19 +510,19 @@ ; } ;} ;AVX2-LABEL: @foo6 -;AVX2: icmp sgt <4 x i32> %reverse, zeroinitializer -;AVX2: shufflevector <4 x i1>{{.*}}<4 x i32> -;AVX2: call <4 x double> @llvm.masked.load.v4f64.p0v4f64 -;AVX2: fadd <4 x double> -;AVX2: call void @llvm.masked.store.v4f64.p0v4f64 +;AVX2: icmp sgt <8 x i32> %reverse, zeroinitializer +;AVX2: shufflevector <8 x i1>{{.*}}<8 x i32> @llvm.masked.load.v8f64.p0v8f64 +;AVX2: fadd <8 x double> +;AVX2: call void @llvm.masked.store.v8f64.p0v8f64 ;AVX2: ret void ;AVX512-LABEL: @foo6 -;AVX512: icmp sgt <8 x i32> %reverse, zeroinitializer -;AVX512: shufflevector <8 x i1>{{.*}}<8 x i32> @llvm.masked.load.v8f64.p0v8f64 -;AVX512: fadd <8 x double> -;AVX512: call void @llvm.masked.store.v8f64.p0v8f64 +;AVX512: icmp sgt <16 x i32> %reverse, zeroinitializer +;AVX512: shufflevector <16 x i1>{{.*}}<16 x i32> @llvm.masked.load.v16f64.p0v16f64 
+;AVX512: fadd <16 x double> +;AVX512: call void @llvm.masked.store.v16f64.p0v16f64 ;AVX512: ret void @@ -582,8 +590,8 @@ ; } ;AVX512-LABEL: @foo7 -;AVX512: call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* -;AVX512: call void @llvm.masked.store.v8f64.p0v8f64 +;AVX512: call <64 x double*> @llvm.masked.load.v64p0f64.p0v64p0f64(<64 x double*>* +;AVX512: call void @llvm.masked.store.v64f64.p0v64f64 ;AVX512: ret void define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigger, i32 %size) #0 { @@ -654,8 +662,8 @@ ;} ;AVX512-LABEL: @foo8 -;AVX512: call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* % -;AVX512: call void @llvm.masked.store.v8f64.p0v8f64 +;AVX512: call <64 x i32 ()*> @llvm.masked.load.v64p0f_i32f.p0v64p0f_i32f(<64 x i32 ()*>* % +;AVX512: call void @llvm.masked.store.v64f64.p0v64f64 ;AVX512: ret void define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigger, i32 %size) #0 { Index: test/Transforms/LoopVectorize/X86/no_fpmath.ll =================================================================== --- test/Transforms/LoopVectorize/X86/no_fpmath.ll +++ test/Transforms/LoopVectorize/X86/no_fpmath.ll @@ -2,7 +2,7 @@ ; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations ; CHECK: remark: no_fpmath.c:6:14: loop not vectorized -; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2) +; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 4, interleaved count: 2) target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.10.0" Index: test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll =================================================================== --- test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll +++ test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll @@ -3,7 +3,7 @@ ; CHECK: remark: 
no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations (hotness: 300) ; CHECK: remark: no_fpmath.c:6:14: loop not vectorized -; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2) (hotness: 300) +; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 4, interleaved count: 2) (hotness: 300) target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.10.0" Index: test/Transforms/LoopVectorize/X86/reduction-crash.ll =================================================================== --- test/Transforms/LoopVectorize/X86/reduction-crash.ll +++ test/Transforms/LoopVectorize/X86/reduction-crash.ll @@ -7,7 +7,7 @@ define void @test1(float* nocapture %arg, i32 %arg1) nounwind { ; CHECK-LABEL: @test1( ; CHECK: preheader -; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0 +; CHECK: insertelement <4 x double> zeroinitializer, double %tmp, i32 0 ; CHECK: vector.memcheck bb: Index: test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll =================================================================== --- test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll +++ test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll @@ -6,7 +6,7 @@ ; DEBUG-OUTPUT-NOT: .loc ; DEBUG-OUTPUT-NOT: {{.*}}.debug_info -; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1) +; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 16, interleaved count: 1) ; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4) ; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1 Index: test/Transforms/LoopVectorize/X86/vectorization-remarks.ll 
=================================================================== --- test/Transforms/LoopVectorize/X86/vectorization-remarks.ll +++ test/Transforms/LoopVectorize/X86/vectorization-remarks.ll @@ -6,7 +6,7 @@ ; DEBUG-OUTPUT-NOT: .loc ; DEBUG-OUTPUT-NOT: {{.*}}.debug_info -; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1) +; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 16, interleaved count: 1) ; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4) ; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1