Index: llvm/lib/Target/AArch64/AArch64Subtarget.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -129,11 +129,17 @@
     MaxBytesForLoopAlignment = 8;
     break;
   case CortexA710:
+    PrefFunctionLogAlignment = 4;
+    VScaleForTuning = 1;
+    PrefLoopLogAlignment = 5;
+    MaxBytesForLoopAlignment = 16;
+    break;
   case CortexX2:
     PrefFunctionLogAlignment = 4;
     VScaleForTuning = 1;
     PrefLoopLogAlignment = 5;
     MaxBytesForLoopAlignment = 16;
+    setSVETailFoldingDefaultOpts(TFSimple);
     break;
   case A64FX:
     CacheLineSize = 256;
@@ -144,6 +150,7 @@
     MinPrefetchStride = 1024;
     MaxPrefetchIterationsAhead = 4;
     VScaleForTuning = 4;
+    setSVETailFoldingDefaultOpts(TFSimple);
     break;
   case AppleA7:
   case AppleA10:
@@ -200,6 +207,7 @@
     PrefLoopLogAlignment = 5;
     MaxBytesForLoopAlignment = 16;
     VScaleForTuning = 2;
+    setSVETailFoldingDefaultOpts(TFSimple);
     break;
   case Neoverse512TVB:
     PrefFunctionLogAlignment = 4;
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -40,29 +40,27 @@

 class TailFoldingKind {
 private:
-  uint8_t Bits = 0; // Currently defaults to disabled.
+  uint8_t DefaultBits = TFAll;
+  uint8_t AddBits = 0;
+  uint8_t RemoveBits = 0;
+  bool NeedsDefault = true;

 public:
-  enum TailFoldingOpts {
-    TFDisabled = 0x0,
-    TFReductions = 0x01,
-    TFRecurrences = 0x02,
-    TFSimple = 0x80,
-    TFAll = TFReductions | TFRecurrences | TFSimple
-  };
+  void setDefault(uint8_t V) { DefaultBits = V; }

   void operator=(const std::string &Val) {
     if (Val.empty())
       return;
     SmallVector<StringRef, 4> TailFoldTypes;
     StringRef(Val).split(TailFoldTypes, '+', -1, false);
+    NeedsDefault = false;
     for (auto TailFoldType : TailFoldTypes) {
       if (TailFoldType == "disabled")
-        Bits = 0;
+        remove(TFAll);
       else if (TailFoldType == "all")
-        Bits = TFAll;
+        add(TFAll);
       else if (TailFoldType == "default")
-        Bits = 0; // Currently defaults to never tail-folding.
+        NeedsDefault = true;
       else if (TailFoldType == "simple")
         add(TFSimple);
       else if (TailFoldType == "reductions")
@@ -83,14 +81,32 @@
     }
   }

-  operator uint8_t() const { return Bits; }
+  operator uint8_t() const {
+    uint8_t Bits = NeedsDefault ? DefaultBits : 0;
+    Bits |= AddBits;
+    Bits &= ~RemoveBits;
+    return Bits;
+  }
+
+  void add(uint8_t Flags) {
+    AddBits |= Flags;
+    RemoveBits &= ~Flags;
+  }

-  void add(uint8_t Flag) { Bits |= Flag; }
-  void remove(uint8_t Flag) { Bits &= ~Flag; }
+  void remove(uint8_t Flags) {
+    RemoveBits |= Flags;
+    AddBits &= ~Flags;
+  }
 };

 TailFoldingKind TailFoldingKindLoc;

+namespace llvm {
+void setSVETailFoldingDefaultOpts(uint8_t V) {
+  TailFoldingKindLoc.setDefault(V);
+}
+} // namespace llvm
+
 cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
     "sve-tail-folding",
     cl::desc(
@@ -3029,7 +3045,7 @@
     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
     TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
     InterleavedAccessInfo *IAI) {
-  if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
+  if (!ST->hasSVE() || TailFoldingKindLoc == TFDisabled)
     return false;

   // We don't currently support vectorisation with interleaving for SVE - with
@@ -3039,12 +3055,13 @@
     return false;

   TailFoldingKind Required; // Defaults to 0.
+  Required.setDefault(0);
   if (LVL->getReductionVars().size())
-    Required.add(TailFoldingKind::TFReductions);
+    Required.add(TFReductions);
   if (LVL->getFirstOrderRecurrences().size())
-    Required.add(TailFoldingKind::TFRecurrences);
+    Required.add(TFRecurrences);
   if (!Required)
-    Required.add(TailFoldingKind::TFSimple);
+    Required.add(TFSimple);

   return (TailFoldingKindLoc & Required) == Required;
 }
Index: llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
===================================================================
--- llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -511,6 +511,16 @@
   }
 }

+enum TailFoldingOpts : uint8_t {
+  TFDisabled = 0x0,
+  TFReductions = 0x01,
+  TFRecurrences = 0x02,
+  TFSimple = 0x80,
+  TFAll = TFReductions | TFRecurrences | TFSimple
+};
+
+void setSVETailFoldingDefaultOpts(uint8_t);
+
 namespace AArch64ExactFPImm {
 struct ExactFPImm {
   const char *Name;
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9744,6 +9744,11 @@
     return CM_ScalarEpilogueAllowed;
   };

+  // If we're forcing the use of epilogue vectorization we should honour that
+  // instead of the TTI hook behaviour.
+  if (EpilogueVectorizationForceVF.getNumOccurrences())
+    return CM_ScalarEpilogueAllowed;
+
   // 4) if the TTI hook indicates this is profitable, request predication.
   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
     return CM_ScalarEpilogueNotNeededUsePredicate;
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -1,10 +1,14 @@
 ; RUN: opt < %s -loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
-; RUN: opt < %s -loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s -check-prefix=CHECK-TF
 ; RUN: opt < %s -loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
-; RUN: opt < %s -loop-vectorize -sve-tail-folding=disabled+simple+reductions+recurrences -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=default+disabled+simple+reductions+recurrences -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt < %s -loop-vectorize -S -mcpu=neoverse-v1 -sve-tail-folding=default+reductions+recurrences | FileCheck %s -check-prefix=CHECK-TF
 ; RUN: opt < %s -loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
-; RUN: opt < %s -loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
+; RUN: opt < %s -loop-vectorize -sve-tail-folding=default+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
 ; RUN: opt < %s -loop-vectorize -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
+; RUN: opt < %s -loop-vectorize -S -sve-tail-folding=default -mcpu=neoverse-v1 | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
+; RUN: opt < %s -loop-vectorize -S -mcpu=neoverse-v1 -sve-tail-folding=default | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1

 target triple = "aarch64-unknown-linux-gnu"
@@ -49,6 +53,14 @@
 ; CHECK-TF-ONLYRED-NOT:     %{{.*}} = phi <vscale x 4 x i1>
 ; CHECK-TF-ONLYRED:         store <vscale x 4 x i32> %[[SPLAT]], <vscale x 4 x i32>*

+; CHECK-NEOVERSE-V1-LABEL: @simple_memset(
+; CHECK-NEOVERSE-V1:       vector.ph:
+; CHECK-NEOVERSE-V1:         %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0
+; CHECK-NEOVERSE-V1:         %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1:         %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:         call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]]
+
 entry:
   br label %while.body
@@ -110,6 +122,15 @@
 ; CHECK-TF-ONLYRED:         %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
 ; CHECK-TF-ONLYRED:       middle.block:
 ; CHECK-TF-ONLYRED-NEXT:    call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+
+; CHECK-NEOVERSE-V1-LABEL: @fadd_red_fast
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1-NOT:     %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:         %[[LOAD:.*]] = load <vscale x 4 x float>
+; CHECK-NEOVERSE-V1:         %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-NEOVERSE-V1:       middle.block:
+; CHECK-NEOVERSE-V1-NEXT:    call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])
+
 entry:
   br label %for.body
@@ -193,6 +214,19 @@
 ; CHECK-TF-ONLYRED:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-TF-ONLYRED:         store <vscale x 4 x i32> %[[ADD]]

+; CHECK-NEOVERSE-V1-LABEL: @add_recur
+; CHECK-NEOVERSE-V1:       entry:
+; CHECK-NEOVERSE-V1:         %[[PRE:.*]] = load i32, i32* %src, align 4
+; CHECK-NEOVERSE-V1:       vector.ph:
+; CHECK-NEOVERSE-V1:         %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1-NOT:     %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1:         %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-NEOVERSE-V1:         %[[LOAD]] = load <vscale x 4 x i32>
+; CHECK-NEOVERSE-V1:         %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-NEOVERSE-V1:         %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-NEOVERSE-V1:         store <vscale x 4 x i32> %[[ADD]]
+
 entry:
   %.pre = load i32, i32* %src, align 4
   br label %for.body
@@ -238,6 +272,12 @@
 ; CHECK-TF-NOREC:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
 ; CHECK-TF-NOREC:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>

+; CHECK-NEOVERSE-V1-LABEL: @interleave(
+; CHECK-NEOVERSE-V1:       vector.body:
+; CHECK-NEOVERSE-V1:         %[[LOAD:.*]] = load <8 x float>, <8 x float>
+; CHECK-NEOVERSE-V1:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+; CHECK-NEOVERSE-V1:         %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+
 entry:
   br label %for.body