diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5783,9 +5783,29 @@
     return ElementCount::getFixed(ConstTripCount);
   }
 
+  LLVMContext &Context = TheLoop->getHeader()->getContext();
+  // The largest type limits the vectorization factor, but this can be too
+  // limiting when smaller memory operations are present, which are not
+  // legal/profitable with the chosen vectorization factor and are only
+  // profitable with larger vectorization factors.
+  //
+  // Try to detect such cases and increase the VF accordingly.
+  bool NarrowMemOpUnprofitable = false;
+  if (SmallestType <= 32 && SmallestType < WidestType &&
+      !MaxVectorSize.isScalable()) {
+    Type *SmallVT = FixedVectorType::get(
+        IntegerType::get(Context, SmallestType), MaxVectorSize.getFixedValue());
+    Type *SmallMaxPossibleVT =
+        FixedVectorType::get(IntegerType::get(Context, SmallestType),
+                             PowerOf2Floor(WidestRegister / SmallestType));
+    NarrowMemOpUnprofitable =
+        TTI.getMemoryOpCost(Instruction::Load, SmallVT, Align(1), 0) >
+        TTI.getMemoryOpCost(Instruction::Load, SmallMaxPossibleVT, Align(1), 0);
+  }
   ElementCount MaxVF = MaxVectorSize;
   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
-      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
+      ((MaximizeBandwidth || NarrowMemOpUnprofitable) &&
+       isScalarEpilogueAllowed())) {
     // Collect all viable vectorization factors larger than the default MaxVF
     // (i.e. MaxVectorSize).
     SmallVector<ElementCount, 8> VFs;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
@@ -9,7 +9,8 @@
 ; i8 memory accesses become profitable.
 define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
 ; CHECK-LABEL: @test_load_i8_store_i32(
-; CHECK-NOT: x i8>
+; CHECK: <16 x i8>
+; CHECK: <16 x i32>
 ;
 entry:
   br label %loop
@@ -33,7 +34,8 @@
 ; Same as test_load_i8_store_i32, but with types flipped for load and store.
 define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
 ; CHECK-LABEL: @test_load_i32_store_i8(
-; CHECK: <4 x i8>
+; CHECK: <16 x i32>
+; CHECK: <16 x i8>
 ;
 entry:
   br label %loop
@@ -85,7 +87,8 @@
 ; vectorization factor.
 define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64 %off, i64 %off.2, i64 %N) {
 ; CHECK-LABEL: @test_load_i8_store_i64_large
-; CHECK: <2 x i64>
+; CHECK: <8 x i8>
+; CHECK: <8 x i64>
 ;
 entry:
   br label %loop
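
For context (not part of the patch): a minimal sketch of the loop shape the new heuristic targets. The function name @narrow_and_wide and the exact IR are illustrative assumptions, reduced from the same pattern as @test_load_i8_store_i32 above. With 128-bit NEON registers, the widest type in the loop (i32) caps the default MaxVectorSize at VF 4, so the i8 load would be emitted as a <4 x i8> access; if TTI reports that load as costlier than the widest possible <16 x i8> load, NarrowMemOpUnprofitable fires and VF 16 is also considered, matching the updated <16 x i8>/<16 x i32> CHECK lines.

```llvm
; Illustrative only (assumed name/IR): i8 elements loaded, widened, and
; stored as i32. SmallestType = 8, WidestType = 32, so the heuristic's
; guard (SmallestType <= 32 && SmallestType < WidestType) is satisfied.
define void @narrow_and_wide(i8* noalias %src, i32* noalias %dst, i64 %N) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr inbounds i8, i8* %src, i64 %iv
  %l = load i8, i8* %src.gep           ; <4 x i8> at VF 4, <16 x i8> at VF 16
  %l.ext = zext i8 %l to i32
  %dst.gep = getelementptr inbounds i32, i32* %dst, i64 %iv
  store i32 %l.ext, i32* %dst.gep      ; widest type caps the default VF at 4
  %iv.next = add nuw i64 %iv, 1
  %ec = icmp eq i64 %iv.next, %N
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}
```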