diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5540,13 +5540,34 @@ } ElementCount MaxVF = MaxVectorElementCount; + // The largest type limits the vectorization factor, but this can be too + // limiting when smaller memory operations are present, which are not + // legal/profitable with the chosen vectorization factor and are only + // profitable with larger vectorization factors. + // + // Try to detect such cases and try increasing the VF in those cases. + LLVMContext &Context = TheLoop->getHeader()->getContext(); + bool NarrowMemOpUnprofitable = false; + if (SmallestType < WidestType) { + Type *SmallVT = + VectorType::get(IntegerType::get(Context, SmallestType), MaxVF); + unsigned MaxVFForSmallTy = PowerOf2Floor( + WidestRegister.divideCoefficientBy(SmallestType).getKnownMinValue()); + ; + Type *SmallMaxPossibleVT = + VectorType::get(IntegerType::get(Context, SmallestType), + MaxVFForSmallTy, MaxVF.isScalable()); + NarrowMemOpUnprofitable = + TTI.getMemoryOpCost(Instruction::Load, SmallVT, Align(1), 0) > + TTI.getMemoryOpCost(Instruction::Load, SmallMaxPossibleVT, Align(1), 0); + } if (TTI.shouldMaximizeVectorBandwidth() || - (MaximizeBandwidth && isScalarEpilogueAllowed())) { + ((MaximizeBandwidth || NarrowMemOpUnprofitable) && + isScalarEpilogueAllowed())) { auto MaxVectorElementCountMaxBW = ElementCount::get( PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), ComputeScalableMaxVF); MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); - // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorElementCount). SmallVector VFs; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll @@ -8,7 +8,8 @@ ; load 4 x i8, vectorization might still be profitable. define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) { ; CHECK-LABEL: @test_load_i8_store_i32( -; CHECK: <4 x i8> +; CHECK: <16 x i8> +; CHECK: <16 x i32> ; entry: br label %loop @@ -32,7 +33,8 @@ ; Same as test_load_i8_store_i32, but with types flipped for load and store. define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) { ; CHECK-LABEL: @test_load_i32_store_i8( -; CHECK: <4 x i8> +; CHECK: <16 x i32> +; CHECK: <16 x i8> ; entry: br label %loop @@ -84,7 +86,8 @@ ; vectorization factor. define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) { ; CHECK-LABEL: @test_load_i8_store_i64_large -; CHECK: <2 x i64> +; CHECK: <8 x i8> +; CHECK: <8 x i64> ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll @@ -28,8 +28,8 @@ ; NEOVERSE-N2: LV: Vector loop of width vscale x 2 costs: 6 (assuming a minimum vscale of 1). ; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1). -; VF-4: <4 x i32> -; VF-VSCALE4: +; VF-4: <16 x i32> +; VF-VSCALE4: <16 x i32> define void @test0(i32* %a, i8* %b, i32* %c) #0 { entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll @@ -9,9 +9,9 @@ define void @test0(i32* %a, i8* %b, i32* %c) #0 { ; CHECK: LV: Checking a loop in "test0" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 -; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4 +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16 ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: vscale x 16 entry: @@ -40,9 +40,9 @@ define void @test1(i32* %a, i8* %b) #0 { ; CHECK: LV: Checking a loop in "test1" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 -; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4 +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4 ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 entry: @@ -72,9 +72,9 @@ define void @test2(i32* %a, i8* %b) #0 { ; CHECK: LV: Checking a loop in "test2" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2 -; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 2 +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2 ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 entry: @@ -104,9 +104,9 @@ define void @test3(i32* %a, i8* %b) #0 { ; CHECK: LV: Checking a loop in "test3" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1 -; CHECK_SCALABLE_ON: LV: Selecting VF: 4 +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1 ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 entry: