Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -135,6 +135,12 @@
     return ST->getVScaleForTuning();
   }
 
+  bool shouldMaximizeVectorBandwidth() const {
+    if (ST->hasSVE())
+      return false;
+    return true;
+  }
+
   /// Try to return an estimate cost factor that can be used as a multiplier
   /// when scalarizing an operation for a vector with ElementCount \p VF.
   /// For scalable vectors this currently takes the most pessimistic view based
Index: llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
@@ -4,11 +4,12 @@
 ; are not profitable.
 
 ; Test with a loop that contains memory accesses of i8 and i32 types. The
-; default maximum VF for NEON is 4. And while we don't have an instruction to
-; load 4 x i8, vectorization might still be profitable.
+; maximum VF for NEON is calculated as 128 / (size in bits of the smallest type
+; in the loop). And while we don't have an instruction to load 4 x i8,
+; vectorization might still be profitable.
 define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
 ; CHECK-LABEL: @test_load_i8_store_i32(
-; CHECK: <4 x i8>
+; CHECK: <16 x i8>
 ;
 entry:
   br label %loop
@@ -32,7 +33,7 @@
 
 ; Same as test_load_i8_store_i32, but with types flipped for load and store.
 define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
 ; CHECK-LABEL: @test_load_i32_store_i8(
-; CHECK: <4 x i8>
+; CHECK: <16 x i8>
 ;
 entry:
   br label %loop
@@ -84,7 +85,7 @@
 ; vectorization factor.
 define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) {
 ; CHECK-LABEL: @test_load_i8_store_i64_large
-; CHECK: <2 x i64>
+; CHECK: <8 x i64>
 ;
 entry:
   br label %loop
Index: llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -116,9 +116,9 @@
 }
 
 ; CHECK-LABEL: @add_d(
-; CHECK: load <4 x i16>
-; CHECK: add nsw <4 x i32>
-; CHECK: store <4 x i32>
+; CHECK: load <8 x i16>
+; CHECK: add nsw <8 x i32>
+; CHECK: store <8 x i32>
 define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
 entry:
   %cmp7 = icmp sgt i32 %len, 0
Index: llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
@@ -123,16 +123,16 @@
 ; }
 ;
 ; CHECK: vector.body:
-; CHECK: phi <8 x i16>
-; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16>
-; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
-; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16>
-; CHECK: add <8 x i16>
-; CHECK: add <8 x i16>
+; CHECK: phi <16 x i16>
+; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16>
+; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8>
+; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16>
+; CHECK: add <16 x i16>
+; CHECK: add <16 x i16>
 ;
 ; CHECK: middle.block:
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16>
 ; CHECK: zext i16 [[Rdx]] to i32
 ;
 define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
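
The updated CHECK lines all follow from the rule quoted in the test comment above: when bandwidth is maximized, the fixed-width VF for NEON is the 128-bit register width divided by the size of the smallest element type in the loop. Below is a minimal sketch of that arithmetic; it is illustrative only, not the LoopVectorize implementation, and the function name and parameters are hypothetical.

    #include <cassert>
    #include <cstdio>

    // Illustrative sketch: fixed-width max VF when maximizing bandwidth,
    // assuming a 128-bit NEON register and the smallest element type present
    // in the loop. Name and signature are hypothetical.
    static unsigned maxVFForSmallestType(unsigned RegBits, unsigned SmallestTypeBits) {
      assert(SmallestTypeBits != 0 && RegBits % SmallestTypeBits == 0);
      return RegBits / SmallestTypeBits;
    }

    int main() {
      // i8 in the loop:  128 / 8  = 16 -> the <16 x i8> checks above.
      // i16 in the loop: 128 / 16 = 8  -> the <8 x i16>/<8 x i32> checks in @add_d.
      std::printf("%u %u\n", maxVFForSmallestType(128, 8), maxVFForSmallestType(128, 16));
      return 0;
    }

Wider types in the same loop are then widened to match that VF (hence <16 x i16> in @reduction_i16_2), and the cost model may still settle on a smaller VF, as the <8 x i64> check in @test_load_i8_store_i64_large suggests.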