diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1196,7 +1196,7 @@
           return VLOperands::ScoreFail;
         // The distance is too large - still may be profitable to use masked
         // loads/gathers.
-        if (std::abs(*Dist) > NumLanes / 2)
+        if (std::abs(*Dist) > 1)
           return VLOperands::ScoreAltOpcodes;
         // This still will detect consecutive loads, but we might have "holes"
         // in some cases. It is ok for non-power-2 vectorization and may produce
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll
@@ -0,0 +1,115 @@
+; RUN: opt < %s -slp-vectorizer -S -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 | FileCheck %s
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+@a = common dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 16
+
+define void @s116_orig() {
+; CHECK-LABEL: @s116_orig(
+; CHECK: [[BASE:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %0
+; CHECK: [[BASE2:%.*]] = bitcast float* [[BASE]] to <4 x float>*
+; CHECK: [[VEC1:%.*]] = load <4 x float>, <4 x float>* [[BASE2]]
+; CHECK: [[VEC2A:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i32 0
+; CHECK: [[EL1:%.*]] = extractelement <4 x float> [[VEC1]], i32 0
+; CHECK: [[VEC2B:%.*]] = insertelement <4 x float> [[VEC2A]], float [[EL1]], i32 1
+; CHECK: [[EL2:%.*]] = extractelement <4 x float> [[VEC1]], i32 1
+; CHECK: [[VEC2C:%.*]] = insertelement <4 x float> [[VEC2B]], float [[EL2]], i32 2
+; CHECK: [[EL3:%.*]] = extractelement <4 x float> [[VEC1]], i32 2
+; CHECK: [[VEC2D:%.*]] = insertelement <4 x float> [[VEC2C]], float [[EL3]], i32 3
+; CHECK: [[FMUL1:%.*]] = fmul fast <4 x float> [[VEC1]], [[VEC2D]]
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %0
+  %1 = load float, float* %arrayidx, align 4, !tbaa !2
+  %arrayidx2 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %indvars.iv
+  %2 = load float, float* %arrayidx2, align 4, !tbaa !2
+  %mul = fmul fast float %2, %1
+  store float %mul, float* %arrayidx2, align 4, !tbaa !2
+  %3 = add nuw nsw i64 %indvars.iv, 2
+  %arrayidx7 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %3
+  %4 = load float, float* %arrayidx7, align 4, !tbaa !2
+  %mul11 = fmul fast float %4, %1
+  store float %mul11, float* %arrayidx, align 4, !tbaa !2
+  %5 = add nuw nsw i64 %indvars.iv, 3
+  %arrayidx17 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %5
+  %6 = load float, float* %arrayidx17, align 4, !tbaa !2
+  %mul21 = fmul fast float %6, %4
+  store float %mul21, float* %arrayidx7, align 4, !tbaa !2
+  %7 = add nuw nsw i64 %indvars.iv, 4
+  %arrayidx27 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %7
+  %8 = load float, float* %arrayidx27, align 4, !tbaa !2
+  %mul31 = fmul fast float %8, %6
+  store float %mul31, float* %arrayidx17, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
+  %arrayidx37 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %indvars.iv.next
+  %9 = load float, float* %arrayidx37, align 4, !tbaa !2
+  %mul41 = fmul fast float %9, %8
+  store float %mul41, float* %arrayidx27, align 4, !tbaa !2
+  %cmp = icmp ult i64 %indvars.iv.next, 31995
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+define void @s116_mod() {
+; CHECK-LABEL: @s116_mod(
+; CHECK: [[BASE:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %0
+; CHECK: [[BASE2:%.*]] = bitcast float* [[BASE]] to <4 x float>*
+; CHECK: [[VEC1:%.*]] = load <4 x float>, <4 x float>* [[BASE2]]
+; CHECK: [[VEC2A:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i32 0
+; CHECK: [[EL1:%.*]] = extractelement <4 x float> [[VEC1]], i32 0
+; CHECK: [[VEC2B:%.*]] = insertelement <4 x float> [[VEC2A]], float [[EL1]], i32 1
+; CHECK: [[EL2:%.*]] = extractelement <4 x float> [[VEC1]], i32 1
+; CHECK: [[VEC2C:%.*]] = insertelement <4 x float> [[VEC2B]], float [[EL2]], i32 2
+; CHECK: [[EL3:%.*]] = extractelement <4 x float> [[VEC1]], i32 2
+; CHECK: [[VEC2D:%.*]] = insertelement <4 x float> [[VEC2C]], float [[EL3]], i32 3
+; CHECK: [[FMUL1:%.*]] = fmul fast <4 x float> [[VEC1]], [[VEC2D]]
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = or i64 %indvars.iv, 1
+  %arrayidx = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %0
+  %1 = load float, float* %arrayidx, align 4, !tbaa !2
+  %arrayidx2 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %indvars.iv
+  %2 = load float, float* %arrayidx2, align 16, !tbaa !2
+  %mul = fmul fast float %2, %1
+  store float %mul, float* %arrayidx2, align 16, !tbaa !2
+  %3 = or i64 %indvars.iv, 2
+  %arrayidx7 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %3
+  %4 = load float, float* %arrayidx7, align 8, !tbaa !2
+  %mul11 = fmul fast float %4, %1
+  store float %mul11, float* %arrayidx, align 4, !tbaa !2
+  %5 = or i64 %indvars.iv, 3
+  %arrayidx17 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %5
+  %6 = load float, float* %arrayidx17, align 4, !tbaa !2
+  %mul21 = fmul fast float %6, %4
+  store float %mul21, float* %arrayidx7, align 8, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  %arrayidx27 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %indvars.iv.next
+  %7 = load float, float* %arrayidx27, align 16, !tbaa !2
+  %mul31 = fmul fast float %7, %6
+  store float %mul31, float* %arrayidx17, align 4, !tbaa !2
+  %cmp = icmp ult i64 %indvars.iv.next, 31996
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="thunderx2t99" "target-features"="+crc,+crypto,+fp-armv8,+lse,+neon,+rdm,+v8.1a" "unsafe-fp-math"="true" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 7.0.2"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"float", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}