diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1196,7 +1196,7 @@
       return VLOperands::ScoreFail;
     // The distance is too large - still may be profitable to use masked
     // loads/gathers.
-    if (std::abs(*Dist) > NumLanes / 2)
+    if (std::abs(*Dist) > 1)
       return VLOperands::ScoreAltOpcodes;
     // This still will detect consecutive loads, but we might have "holes"
     // in some cases. It is ok for non-power-2 vectorization and may produce
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -slp-vectorizer -S -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+; This test is reduced from the TSVC evaluation of vectorizers:
+; https://github.com/llvm/llvm-test-suite/commits/main/MultiSource/Benchmarks/TSVC/LoopRerolling-flt/tsc.c
+
+define void @s116_modified(float* %a) {
+; CHECK-LABEL: @s116_modified(
+; CHECK: load <4 x float>, <4 x float>*
+; CHECK: fmul fast <4 x float>
+; CHECK: store <4 x float>
+  %gep0 = getelementptr inbounds float, float* %a, i64 0
+  %gep1 = getelementptr inbounds float, float* %a, i64 1
+  %gep2 = getelementptr inbounds float, float* %a, i64 2
+  %gep3 = getelementptr inbounds float, float* %a, i64 3
+  %gep4 = getelementptr inbounds float, float* %a, i64 4
+  %ld0 = load float, float* %gep0
+  %ld1 = load float, float* %gep1
+  %ld2 = load float, float* %gep2
+  %ld3 = load float, float* %gep3
+  %ld4 = load float, float* %gep4
+  %mul0 = fmul fast float %ld0, %ld1
+  %mul1 = fmul fast float %ld2, %ld1
+  %mul2 = fmul fast float %ld3, %ld2
+  %mul3 = fmul fast float %ld4, %ld3
+  store float %mul0, float* %gep0
+  store float %mul1, float* %gep1
+  store float %mul2, float* %gep2
+  store float %mul3, float* %gep3
+  ret void
+}
+
+