diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1196,7 +1196,7 @@
       return VLOperands::ScoreFail;
     // The distance is too large - still may be profitable to use masked
     // loads/gathers.
-    if (std::abs(*Dist) > NumLanes / 2)
+    if (std::abs(*Dist) > 1)
      return VLOperands::ScoreAltOpcodes;
    // This still will detect consecutive loads, but we might have "holes"
    // in some cases. It is ok for non-power-2 vectorization and may produce
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -0,0 +1,50 @@
+; RUN: opt < %s -slp-vectorizer -S -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+; This test is reduced from the TSVC evaluation of vectorizers:
+; https://github.com/llvm/llvm-test-suite/commits/main/MultiSource/Benchmarks/TSVC/LoopRerolling-flt/tsc.c
+
+define void @s116_modified(float* %a) {
+; CHECK-LABEL: @s116_modified(
+; CHECK: [[VEC1:%.*]] = load <4 x float>, <4 x float>* %{{.*}}
+; CHECK: [[VEC2A:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i32 0
+; CHECK: [[EL1:%.*]] = extractelement <4 x float> [[VEC1]], i32 0
+; CHECK: [[VEC2B:%.*]] = insertelement <4 x float> [[VEC2A]], float [[EL1]], i32 1
+; CHECK: [[EL2:%.*]] = extractelement <4 x float> [[VEC1]], i32 1
+; CHECK: [[VEC2C:%.*]] = insertelement <4 x float> [[VEC2B]], float [[EL2]], i32 2
+; CHECK: [[EL3:%.*]] = extractelement <4 x float> [[VEC1]], i32 2
+; CHECK: [[VEC2D:%.*]] = insertelement <4 x float> [[VEC2C]], float [[EL3]], i32 3
+; CHECK: [[FMUL1:%.*]] = fmul fast <4 x float> [[VEC1]], [[VEC2D]]
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %offset_1 = or i64 %indvars.iv, 1
+  %offset_2 = or i64 %indvars.iv, 2
+  %offset_3 = or i64 %indvars.iv, 3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %offset_1
+  %0 = load float, float* %arrayidx
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2
+  %arrayidx7 = getelementptr inbounds float, float* %a, i64 %offset_2
+  %2 = load float, float* %arrayidx7
+  %arrayidx17 = getelementptr inbounds float, float* %a, i64 %offset_3
+  %3 = load float, float* %arrayidx17
+  %arrayidx27 = getelementptr inbounds float, float* %a, i64 %indvars.iv.next
+  %4 = load float, float* %arrayidx27
+  %mul = fmul fast float %1, %0
+  %mul11 = fmul fast float %2, %0
+  %mul21 = fmul fast float %3, %2
+  %mul31 = fmul fast float %4, %3
+  store float %mul, float* %arrayidx2
+  store float %mul11, float* %arrayidx
+  store float %mul21, float* %arrayidx7
+  store float %mul31, float* %arrayidx17
+  %cmp = icmp ult i64 %indvars.iv.next, 100
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
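
For reference, a scalar C sketch of what @s116_modified computes, reconstructed from the IR in the new test above; the function name, the iteration bound of 100, and the assumption that a[] holds at least 101 floats come from (or are implied by) the test itself, and the kernel is a four-statement variant of the s116 loop in the linked tsc.c:

    /* Scalar sketch of @s116_modified (reconstructed from the IR above).
       Each iteration rewrites a[i..i+3] from overlapping neighbours, so the
       two fmul operand vectors can share most of their elements. */
    void s116_modified(float *a) {
      /* reads up to a[i + 4], so a[] is assumed to have >= 101 elements */
      for (long i = 0; i < 100; i += 4) {
        a[i]     = a[i]     * a[i + 1];
        a[i + 1] = a[i + 2] * a[i + 1];
        a[i + 2] = a[i + 3] * a[i + 2];
        a[i + 3] = a[i + 4] * a[i + 3];
      }
    }

The CHECK lines in the test encode the expected shape of the vectorized body: one <4 x float> load (the a[i+1..i+4] operand), with the second fmul operand assembled from a scalar element in lane 0 plus extracts of that load in lanes 1-3, rather than a gather of all five scalar loads.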