Index: llvm/lib/Target/AArch64/AArch64Subtarget.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -226,6 +226,7 @@
     PrefLoopAlignment = Align(32);
     MaxBytesForLoopAlignment = 16;
     VScaleForTuning = 2;
+    setSVETailFoldingDefaultOpts(TFSimple);
     break;
   case Neoverse512TVB:
     PrefFunctionAlignment = Align(16);
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -42,64 +42,97 @@
 namespace {
 class TailFoldingKind {
 private:
-  uint8_t Bits = 0; // Currently defaults to disabled.
+  uint8_t Bits;
 
 public:
-  enum TailFoldingOpts {
-    TFDisabled = 0x0,
-    TFReductions = 0x01,
-    TFRecurrences = 0x02,
-    TFReverse = 0x04,
-    TFSimple = 0x80,
-    TFAll = TFReductions | TFRecurrences | TFReverse | TFSimple
-  };
+  TailFoldingKind(uint8_t Bits) : Bits(Bits) {}
 
-  void operator=(const std::string &Val) {
-    if (Val.empty())
-      return;
-    SmallVector<StringRef, 4> TailFoldTypes;
-    StringRef(Val).split(TailFoldTypes, '+', -1, false);
+  void add(uint8_t Bit) { Bits |= Bit; }
+
+  void remove(uint8_t Bit) { Bits &= ~Bit; }
+
+  operator uint8_t() const { return Bits; }
+};
+
+class TailFoldingOption {
+private:
+  uint8_t DefaultBits = TFDisabled;
+  std::string OrigVal;
+  SmallVector<StringRef, 4> TailFoldTypes;
+
+  uint8_t getBits() const {
+    if (!TailFoldTypes.size())
+      return DefaultBits;
+
+    TailFoldingKind Bits(0);
     for (auto TailFoldType : TailFoldTypes) {
       if (TailFoldType == "disabled")
-        Bits = 0;
+        Bits.remove(TFAll);
       else if (TailFoldType == "all")
-        Bits = TFAll;
+        Bits.add(TFAll);
       else if (TailFoldType == "default")
-        Bits = 0; // Currently defaults to never tail-folding.
+        Bits.add(DefaultBits);
       else if (TailFoldType == "simple")
-        add(TFSimple);
+        Bits.add(TFSimple);
       else if (TailFoldType == "reductions")
-        add(TFReductions);
+        Bits.add(TFReductions);
       else if (TailFoldType == "recurrences")
-        add(TFRecurrences);
+        Bits.add(TFRecurrences);
       else if (TailFoldType == "reverse")
-        add(TFReverse);
+        Bits.add(TFReverse);
       else if (TailFoldType == "noreductions")
-        remove(TFReductions);
+        Bits.remove(TFReductions);
      else if (TailFoldType == "norecurrences")
-        remove(TFRecurrences);
+        Bits.remove(TFRecurrences);
      else if (TailFoldType == "noreverse")
-        remove(TFReverse);
-      else {
+        Bits.remove(TFReverse);
+      else
+        llvm_unreachable("No! That's impossible!");
+    }
+
+    return Bits;
+  }
+
+public:
+
+  void setDefault(uint8_t V) { DefaultBits = V; }
+
+  void operator=(const std::string &Val) {
+    if (Val.empty())
+      return;
+
+    OrigVal = Val;
+    StringRef(OrigVal).split(TailFoldTypes, '+', -1, false);
+    for (auto TailFoldType : TailFoldTypes) {
+      if (TailFoldType != "disabled" && TailFoldType != "all" &&
+          TailFoldType != "default" && TailFoldType != "simple" &&
+          TailFoldType != "reductions" && TailFoldType != "recurrences" &&
+          TailFoldType != "reverse" && TailFoldType != "noreductions" &&
+          TailFoldType != "norecurrences" && TailFoldType != "noreverse") {
         errs() << "invalid argument " << TailFoldType.str()
                << " to -sve-tail-folding=; each element must be one of: disabled, "
                   "all, default, simple, reductions, noreductions, recurrences, "
-                  "norecurrences\n";
+                  "norecurrences, reverse, noreverse\n";
       }
     }
   }
 
-  operator uint8_t() const { return Bits; }
-
-  void add(uint8_t Flag) { Bits |= Flag; }
-  void remove(uint8_t Flag) { Bits &= ~Flag; }
+  bool satisfies(TailFoldingKind Required) const {
+    return (getBits() & Required) == Required;
+  }
 };
 } // namespace
 
-TailFoldingKind TailFoldingKindLoc;
+TailFoldingOption TailFoldingOptionLoc;
+
+namespace llvm {
+void setSVETailFoldingDefaultOpts(uint8_t V) {
+  TailFoldingOptionLoc.setDefault(V);
+}
+} // namespace llvm
 
-cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
+cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
     "sve-tail-folding",
     cl::desc(
         "Control the use of vectorisation using tail-folding for SVE:"
@@ -114,7 +147,7 @@
         "recurrences"
         "\nreverse    Use tail-folding for loops requiring reversed "
         "predicates"),
-    cl::location(TailFoldingKindLoc));
+    cl::location(TailFoldingOptionLoc));
 
 // Experimental option that will only be fully functional when the
 // code-generator is changed to use SVE instead of NEON for all fixed-width
@@ -3450,7 +3483,7 @@
 }
 
 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
-  if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
+  if (!ST->hasSVE())
     return false;
 
   // We don't currently support vectorisation with interleaving for SVE - with
@@ -3459,22 +3492,22 @@
   if (TFI->IAI->hasGroups())
     return false;
 
-  TailFoldingKind Required; // Defaults to 0.
+  TailFoldingKind Required(0);
   if (TFI->LVL->getReductionVars().size())
-    Required.add(TailFoldingKind::TFReductions);
+    Required.add(TFReductions);
   if (TFI->LVL->getFixedOrderRecurrences().size())
-    Required.add(TailFoldingKind::TFRecurrences);
+    Required.add(TFRecurrences);
 
   // We call this to discover whether any load/store pointers in the loop have
   // negative strides. This will require extra work to reverse the loop
   // predicate, which may be expensive.
   if (containsDecreasingPointers(TFI->LVL->getLoop(),
                                  TFI->LVL->getPredicatedScalarEvolution()))
-    Required.add(TailFoldingKind::TFReverse);
+    Required.add(TFReverse);
   if (!Required)
-    Required.add(TailFoldingKind::TFSimple);
+    Required.add(TFSimple);
 
-  return (TailFoldingKindLoc & Required) == Required;
+  return TailFoldingOptionLoc.satisfies(Required);
 }
 
 InstructionCost
Index: llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
===================================================================
--- llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -529,6 +529,17 @@
   }
 }
 
+enum TailFoldingOpts : uint8_t {
+  TFDisabled = 0x0,
+  TFReductions = 0x01,
+  TFRecurrences = 0x02,
+  TFReverse = 0x04,
+  TFSimple = 0x80,
+  TFAll = TFReductions | TFRecurrences | TFSimple | TFReverse
+};
+
+void setSVETailFoldingDefaultOpts(uint8_t);
+
 namespace AArch64ExactFPImm {
 struct ExactFPImm {
   const char *Name;
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll
@@ -1,7 +1,5 @@
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
-; RUN:   -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
-; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
-; RUN:   -mcpu=neoverse-v1 < %s | FileCheck %s --check-prefix=CHECK-EPILOG
+; RUN:   -mcpu=neoverse-v1 -sve-tail-folding=disabled < %s | FileCheck %s --check-prefix=CHECK-EPILOG
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
 ; RUN:   -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
 ; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -1,11 +1,15 @@
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -S | FileCheck %s -check-prefix=CHECK-NOTF
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
-; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=disabled+simple+reductions+recurrences+reverse -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=default+disabled+simple+reductions+recurrences+reverse -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -S -mcpu=neoverse-v1 -sve-tail-folding=default+reductions+recurrences+reverse | FileCheck %s -check-prefix=CHECK-TF
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+noreverse -S | FileCheck %s -check-prefix=CHECK-TF-NOREV
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -S -sve-tail-folding=default -mcpu=neoverse-v1 | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -S -mcpu=neoverse-v1 -sve-tail-folding=default | FileCheck %s -check-prefix=CHECK-NEOVERSE-V1
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -58,6 +62,14 @@
 ; CHECK-TF-ONLYRED-NOT: %{{.*}} = phi <vscale x 4 x i1>
 ; CHECK-TF-ONLYRED: store <vscale x 4 x i32> %[[SPLAT]], <vscale x 4 x i32>*
 
+; CHECK-NEOVERSE-V1-LABEL: @simple_memset(
+; CHECK-NEOVERSE-V1: vector.ph:
+; CHECK-NEOVERSE-V1: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i64 0
+; CHECK-NEOVERSE-V1: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEOVERSE-V1: vector.body:
+; CHECK-NEOVERSE-V1: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]]
+
 entry:
   br label %while.body
@@ -129,6 +141,15 @@
 ; CHECK-TF-ONLYRED: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
 ; CHECK-TF-ONLYRED: middle.block:
 ; CHECK-TF-ONLYRED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+
+; CHECK-NEOVERSE-V1-LABEL: @fadd_red_fast
+; CHECK-NEOVERSE-V1: vector.body:
+; CHECK-NEOVERSE-V1-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1: %[[LOAD:.*]] = load <vscale x 4 x float>
+; CHECK-NEOVERSE-V1: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-NEOVERSE-V1: middle.block:
+; CHECK-NEOVERSE-V1-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]])
+
 entry:
   br label %for.body
@@ -225,6 +246,19 @@
 ; CHECK-TF-ONLYRED: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-TF-ONLYRED: store <vscale x 4 x i32> %[[ADD]]
 
+; CHECK-NEOVERSE-V1-LABEL: @add_recur
+; CHECK-NEOVERSE-V1: entry:
+; CHECK-NEOVERSE-V1: %[[PRE:.*]] = load i32, i32* %src, align 4
+; CHECK-NEOVERSE-V1: vector.ph:
+; CHECK-NEOVERSE-V1: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-NEOVERSE-V1: vector.body:
+; CHECK-NEOVERSE-V1-NOT: %{{.*}} = phi <vscale x 4 x i1>
+; CHECK-NEOVERSE-V1: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-NEOVERSE-V1: %[[LOAD]] = load <vscale x 4 x i32>
+; CHECK-NEOVERSE-V1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-NEOVERSE-V1: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-NEOVERSE-V1: store <vscale x 4 x i32> %[[ADD]]
+
 entry:
   %.pre = load i32, i32* %src, align 4
   br label %for.body
@@ -276,6 +310,12 @@
 ; CHECK-TF-NOREV: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
 ; CHECK-TF-NOREV: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
 
+; CHECK-NEOVERSE-V1-LABEL: @interleave(
+; CHECK-NEOVERSE-V1: vector.body:
+; CHECK-NEOVERSE-V1: %[[LOAD:.*]] = load <8 x float>, <8 x float>
+; CHECK-NEOVERSE-V1: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+; CHECK-NEOVERSE-V1: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+
 entry:
   br label %for.body
@@ -335,6 +375,12 @@
 ; CHECK-TF-NOREC: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
 ; CHECK-TF-NOREC: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64({{.*}} %reverse
 
+; CHECK-TF-NEOVERSE-V1-LABEL: @reverse(
+; CHECK-TF-NEOVERSE-V1: vector.body:
+; CHECK-TF-NEOVERSE-V1-NOT: %{{.*}} = phi <vscale x 2 x i1>
+; CHECK-TF-NEOVERSE-V1: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* %18, align 8
+; CHECK-TF-NEOVERSE-V1: %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+
 entry:
   br label %for.body