diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5619,9 +5619,11 @@ // consider interleaving beneficial (eg. MVE). if (TTI.getMaxInterleaveFactor(VF) <= 1) return false; - // FIXME: We should consider changing the threshold for scalable - // vectors to take VScaleForTuning into account. - if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF) + + unsigned Multiplier = 1; + if (VF.isScalable()) + Multiplier = getVScaleForTuning().value_or(1); + if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) return true; return false; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll @@ -1,5 +1,7 @@ -; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -S %s | FileCheck --check-prefixes=CHECK,DEFAULT %s -; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefixes=CHECK,THRESHOLD %s +; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 \ +; RUN: -enable-epilogue-vectorization=false -S %s | FileCheck --check-prefixes=CHECK,DEFAULT %s +; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-iphoneos -vectorizer-min-trip-count=8 \ +; RUN: -enable-epilogue-vectorization=false -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefixes=CHECK,THRESHOLD %s ; Tests for loops with large numbers of runtime checks. Check that loops are ; vectorized, if the loop trip counts are large and the impact of the runtime diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll @@ -0,0 +1,37 @@ +; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \ +; RUN: -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG +; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \ +; RUN: -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG +; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \ +; RUN: -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG +; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \ +; RUN: -mcpu=cortex-x2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG + +target triple = "aarch64-unknown-linux-gnu" + +define void @foo(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i64 %len) #0 { +; CHECK-EPILOG: vec.epilog.ph: +; CHECK-EPILOG: vec.epilog.vector.body: +; CHECK-EPILOG: load + +; CHECK-NO-EPILOG-NOT: vec.epilog.vector.ph: +; CHECK-NO-EPILOG-NOT: vec.epilog.vector.body: +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i16, ptr %p, i64 %indvars.iv + %0 = load i16, ptr %arrayidx + %add = add nuw nsw i16 %0, 2 + %arrayidx3 = getelementptr inbounds i16, ptr %q, i64 %indvars.iv + store i16 %add, ptr %arrayidx3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %len + br i1 %exitcond, label %exit, label %for.body + +exit: ; preds = %for.body + ret void +} + +attributes #0 = { "target-features"="+sve" vscale_range(1,16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll @@ -1,13 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=loop-vectorize,dce -mtriple aarch64-linux-gnu -mattr=+sve \ -; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s -S | FileCheck %s +; RUN: opt -passes=loop-vectorize,dce -prefer-predicate-over-epilogue=scalar-epilogue \ +; RUN: -enable-epilogue-vectorization=false < %s -S | FileCheck %s -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" ; This should be vscale x 8 vectorized, maybe with some interleaving. -define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef readonly %s, i32 noundef %n) { +define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef readonly %s, i32 noundef %n) #0 { ; CHECK-LABEL: @fneg( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[S2:%.*]] = ptrtoint ptr [[S:%.*]] to i64 @@ -100,3 +99,5 @@ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } + +attributes #0 = { "target-features"="+sve" vscale_range(1,16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; REQUIRES: asserts ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \ -; RUN: -debug-only=loop-vectorize 2>%t < %s | FileCheck %s +; RUN: -enable-epilogue-vectorization=false -debug-only=loop-vectorize 2>%t < %s | FileCheck %s ; RUN: cat %t | FileCheck %s --check-prefix=CHECK-COST target triple = "aarch64-unknown-linux-gnu" @@ -17,7 +17,6 @@ ; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %conv = zext i8 %0 to i32 ; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %conv = zext i8 %0 to i32 ; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction: %conv = zext i8 %0 to i32 - ; CHECK-LABEL: define void @zext_i8_i16 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -101,7 +100,6 @@ ; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 2 For instruction: %conv = sext i8 %0 to i32 ; CHECK-COST: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %conv = sext i8 %0 to i32 ; CHECK-COST: LV: Found an estimated cost of 0 for VF vscale x 8 For instruction: %conv = sext i8 %0 to i32 - ; CHECK-LABEL: define void @sext_i8_i16 ; CHECK-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: