Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -52,6 +52,7 @@ class LoopAccessInfo; class Loop; class LoopInfo; +class LoopVectorizationLegality; class ProfileSummaryInfo; class RecurrenceDescriptor; class SCEV; @@ -530,7 +531,7 @@ bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, - const LoopAccessInfo *LAI) const; + LoopVectorizationLegality *LVL) const; /// Query the target whether lowering of the llvm.get.active.lane.mask intrinsic is supported and how the mask should be used. A return value @@ -1555,10 +1556,12 @@ AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) = 0; - virtual bool - preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, - AssumptionCache &AC, TargetLibraryInfo *TLI, - DominatorTree *DT, const LoopAccessInfo *LAI) = 0; + virtual bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, + ScalarEvolution &SE, + AssumptionCache &AC, + TargetLibraryInfo *TLI, + DominatorTree *DT, + LoopVectorizationLegality *LVL) = 0; virtual PredicationStyle emitGetActiveLaneMask() = 0; virtual Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) = 0; @@ -1935,8 +1938,8 @@ bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, - const LoopAccessInfo *LAI) override { - return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); + LoopVectorizationLegality *LVL) override { - return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); + return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL); } PredicationStyle emitGetActiveLaneMask() override { return Impl.emitGetActiveLaneMask(); Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
=================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -163,7 +163,7 @@ bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, - const LoopAccessInfo *LAI) const { + LoopVectorizationLegality *LVL) const { return false; } Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -603,8 +603,8 @@ bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, - const LoopAccessInfo *LAI) { - return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); + LoopVectorizationLegality *LVL) { + return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL); } PredicationStyle emitGetActiveLaneMask() { Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -294,8 +294,8 @@ bool TargetTransformInfo::preferPredicateOverEpilogue( Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, - const LoopAccessInfo *LAI) const { - return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); + LoopVectorizationLegality *LVL) const { + return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL); } PredicationStyle TargetTransformInfo::emitGetActiveLaneMask() const { Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ 
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -340,6 +340,11 @@ return PredicationStyle::None; } + bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, + AssumptionCache &AC, TargetLibraryInfo *TLI, + DominatorTree *DT, + LoopVectorizationLegality *LVL); + bool supportsScalableVectors() const { return ST->hasSVE(); } bool enableScalableVectorization() const { return ST->hasSVE(); } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include <algorithm> using namespace llvm; using namespace llvm::PatternMatch; @@ -37,6 +38,74 @@ static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden); +class TailFoldingKind { +private: + uint8_t Bits = 0; // Currently defaults to disabled. + +public: + enum TailFoldingOpts { + TFDisabled = 0x0, + TFReductions = 0x01, + TFRecurrences = 0x02, + TFSimple = 0x80, + TFAll = TFReductions | TFRecurrences | TFSimple + }; + + void operator=(const std::string &Val) { + if (Val.empty()) + return; + SmallVector<StringRef, 4> TailFoldTypes; + StringRef(Val).split(TailFoldTypes, '+', -1, false); + for (auto TailFoldType : TailFoldTypes) { + if (TailFoldType == "disabled") + Bits = 0; + else if (TailFoldType == "all") + Bits = TFAll; + else if (TailFoldType == "default") + Bits = 0; // Currently defaults to never tail-folding.
+ else if (TailFoldType == "simple") + Bits = TFSimple; + else if (TailFoldType == "reductions") + add(TFReductions); + else if (TailFoldType == "recurrences") + add(TFRecurrences); + else if (TailFoldType == "noreductions") + remove(TFReductions); + else if (TailFoldType == "norecurrences") + remove(TFRecurrences); + else { + errs() + << "invalid argument " << TailFoldType.str() + << " to -sve-tail-folding=; each element must be one of: disabled, " + "all, default, simple, reductions, noreductions, recurrences, " + "norecurrences\n"; + } + } + } + + operator uint8_t() const { return Bits; } + + void add(uint8_t Flag) { Bits |= Flag; } + void remove(uint8_t Flag) { Bits &= ~Flag; } +}; + +TailFoldingKind TailFoldingKindLoc; + +cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding( + "sve-tail-folding", + cl::desc( + "Control the use of vectorisation using tail-folding for SVE:" + "\ndisabled No loop types will vectorize using tail-folding" + "\ndefault Uses the default tail-folding settings for the target " + "CPU" + "\nall All legal loop types will vectorize using tail-folding" + "\nsimple Use tail-folding for simple loops (not reductions or " + "recurrences)" + "\nreductions Use tail-folding for loops containing reductions" + "\nrecurrences Use tail-folding for loops containing first order " + "recurrences"), + cl::location(TailFoldingKindLoc)); + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); @@ -2955,3 +3024,20 @@ return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); } + +bool AArch64TTIImpl::preferPredicateOverEpilogue( + Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, + TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) { + if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled) + return false; + + if (!(TailFoldingKindLoc & TailFoldingKind::TFReductions) && + LVL->getReductionVars().size()) + return false; + + if 
(!(TailFoldingKindLoc & TailFoldingKind::TFRecurrences) && + LVL->getFirstOrderRecurrences().size()) + return false; + + return true; +} Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -288,12 +288,10 @@ AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo); - bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, - ScalarEvolution &SE, - AssumptionCache &AC, - TargetLibraryInfo *TLI, + bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, + AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, - const LoopAccessInfo *LAI); + LoopVectorizationLegality *LVL); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE); Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -20,8 +20,8 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" @@ -33,6 +33,7 @@ #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -2197,12 +2198,9 @@ return true; } -bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, - ScalarEvolution &SE, - AssumptionCache &AC, - TargetLibraryInfo *TLI, - DominatorTree *DT, - const LoopAccessInfo *LAI) { +bool ARMTTIImpl::preferPredicateOverEpilogue( + Loop *L, 
LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, + TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) { if (!EnableTailPredication) { LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n"); return false; @@ -2244,7 +2242,7 @@ return false; } - return canTailPredicateLoop(L, LI, SE, DL, LAI); + return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI()); } PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const { Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9761,8 +9761,7 @@ }; // 4) if the TTI hook indicates this is profitable, request predication. - if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, - LVL.getLAI())) + if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL)) return CM_ScalarEpilogueNotNeededUsePredicate; return CM_ScalarEpilogueAllowed; Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll @@ -0,0 +1,190 @@ +; RUN: opt < %s -loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF +; RUN: opt < %s -loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF +; RUN: opt < %s -loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF +; RUN: opt < %s -loop-vectorize -sve-tail-folding=simple+reductions+recurrences -S | FileCheck %s -check-prefix=CHECK-TF +; RUN: opt < %s -loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED +; RUN: opt < %s -loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC + +target triple = "aarch64-unknown-linux-gnu" + +define void @simple_memset(i32 %val, 
i32* %ptr, i64 %n) #0 { +; CHECK-NOTF-LABEL: @simple_memset( +; CHECK-NOTF: vector.ph: +; CHECK-NOTF: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0 +; CHECK-NOTF: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-NOTF: vector.body: +; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1> +; CHECK-NOTF: store <vscale x 4 x i32> %[[SPLAT]], <vscale x 4 x i32>* + +; CHECK-TF-NORED-LABEL: @simple_memset( +; CHECK-TF-NORED: vector.ph: +; CHECK-TF-NORED: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0 +; CHECK-TF-NORED: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-TF-NORED: vector.body: +; CHECK-TF-NORED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1> +; CHECK-TF-NORED: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]] + +; CHECK-TF-NOREC-LABEL: @simple_memset( +; CHECK-TF-NOREC: vector.ph: +; CHECK-TF-NOREC: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0 +; CHECK-TF-NOREC: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-TF-NOREC: vector.body: +; CHECK-TF-NOREC: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1> +; CHECK-TF-NOREC: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]] + +; CHECK-TF-LABEL: @simple_memset( +; CHECK-TF: vector.ph: +; CHECK-TF: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i32 0 +; CHECK-TF: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1> +; CHECK-TF: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} %[[ACTIVE_LANE_MASK]] + +entry: + br label %while.body + +while.body: ; preds = %while.body, %entry + %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ] + %gep = getelementptr i32, i32* %ptr, i64 %index + store i32 %val, i32* %gep + %index.next = add nsw i64 %index, 1 + %cmp10 = icmp ult i64 %index.next, %n + br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0 + +while.end.loopexit: ; preds = %while.body + ret void +} + 
+define float @fadd_red_fast(float* noalias nocapture readonly %a, i64 %n) #0 { +; CHECK-NOTF-LABEL: @fadd_red_fast +; CHECK-NOTF: vector.body: +; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1> +; CHECK-NOTF: %[[LOAD:.*]] = load <vscale x 4 x float> +; CHECK-NOTF: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]] +; CHECK-NOTF: middle.block: +; CHECK-NOTF-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]]) + +; CHECK-TF-NORED-LABEL: @fadd_red_fast +; CHECK-TF-NORED: vector.body: +; CHECK-TF-NORED-NOT: %{{.*}} = phi <vscale x 4 x i1> +; CHECK-TF-NORED: %[[LOAD:.*]] = load <vscale x 4 x float> +; CHECK-TF-NORED: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]] +; CHECK-TF-NORED: middle.block: +; CHECK-TF-NORED-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[ADD]]) + +; CHECK-TF-NOREC-LABEL: @fadd_red_fast +; CHECK-TF-NOREC: vector.body: +; CHECK-TF-NOREC: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1> +; CHECK-TF-NOREC: %[[VEC_PHI:.*]] = phi <vscale x 4 x float> +; CHECK-TF-NOREC: %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32({{.*}} %[[ACTIVE_LANE_MASK]] +; CHECK-TF-NOREC: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]] +; CHECK-TF-NOREC: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]] +; CHECK-TF-NOREC: middle.block: +; CHECK-TF-NOREC-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]]) + +; CHECK-TF-LABEL: @fadd_red_fast +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1> +; CHECK-TF: %[[VEC_PHI:.*]] = phi <vscale x 4 x float> +; CHECK-TF: %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32({{.*}} %[[ACTIVE_LANE_MASK]] +; CHECK-TF: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]] +; CHECK-TF: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]] +; CHECK-TF: middle.block: +; CHECK-TF-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]]) +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds float, float* %a, i64 
%iv + %0 = load float, float* %arrayidx, align 4 + %add = fadd fast float %0, %sum.07 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret float %add +} + +define void @add_recur(i32* noalias %dst, i32* noalias %src, i64 %n) #0 { +; CHECK-NOTF-LABEL: @add_recur +; CHECK-NOTF: entry: +; CHECK-NOTF: %[[PRE:.*]] = load i32, i32* %src, align 4 +; CHECK-NOTF: vector.ph: +; CHECK-NOTF: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]] +; CHECK-NOTF: vector.body: +; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 4 x i1> +; CHECK-NOTF: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ] +; CHECK-NOTF: %[[LOAD]] = load <vscale x 4 x i32> +; CHECK-NOTF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1) +; CHECK-NOTF: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]] +; CHECK-NOTF: store <vscale x 4 x i32> %[[ADD]] + +; CHECK-TF-NORED-LABEL: @add_recur +; CHECK-TF-NORED: entry: +; CHECK-TF-NORED: %[[PRE:.*]] = load i32, i32* %src, align 4 +; CHECK-TF-NORED: vector.ph: +; CHECK-TF-NORED: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]] +; CHECK-TF-NORED: vector.body: +; CHECK-TF-NORED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1> +; CHECK-TF-NORED: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ] +; CHECK-TF-NORED: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32({{.*}} %[[ACTIVE_LANE_MASK]] +; CHECK-TF-NORED: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1) +; CHECK-TF-NORED: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]] +; CHECK-TF-NORED: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], {{.*}} %[[ACTIVE_LANE_MASK]]) + +; CHECK-TF-NOREC-LABEL: @add_recur +; CHECK-TF-NOREC: entry: +; CHECK-TF-NOREC: %[[PRE:.*]] = load i32, i32* %src, align 4 +; CHECK-TF-NOREC: vector.ph: +; CHECK-TF-NOREC: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 
%[[PRE]] +; CHECK-TF-NOREC: vector.body: +; CHECK-TF-NOREC-NOT: %{{.*}} = phi <vscale x 4 x i1> +; CHECK-TF-NOREC: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ] +; CHECK-TF-NOREC: %[[LOAD]] = load <vscale x 4 x i32> +; CHECK-TF-NOREC: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1) +; CHECK-TF-NOREC: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]] +; CHECK-TF-NOREC: store <vscale x 4 x i32> %[[ADD]] + +; CHECK-TF-LABEL: @add_recur +; CHECK-TF: entry: +; CHECK-TF: %[[PRE:.*]] = load i32, i32* %src, align 4 +; CHECK-TF: vector.ph: +; CHECK-TF: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]] +; CHECK-TF: vector.body: +; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1> +; CHECK-TF: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ] +; CHECK-TF: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32({{.*}} %[[ACTIVE_LANE_MASK]] +; CHECK-TF: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1) +; CHECK-TF: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]] +; CHECK-TF: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], {{.*}} %[[ACTIVE_LANE_MASK]]) + +entry: + %.pre = load i32, i32* %src, align 4 + br label %for.body + +for.body: ; preds = %entry, %for.body + %0 = phi i32 [ %1, %for.body ], [ %.pre, %entry ] + %i.010 = phi i64 [ %add, %for.body ], [ 0, %entry ] + %add = add nuw nsw i64 %i.010, 1 + %arrayidx1 = getelementptr inbounds i32, i32* %src, i64 %add + %1 = load i32, i32* %arrayidx1, align 4 + %add2 = add nsw i32 %1, %0 + %arrayidx3 = getelementptr inbounds i32, i32* %dst, i64 %i.010 + store i32 %add2, i32* %arrayidx3, align 4 + %exitcond.not = icmp eq i64 %add, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: ; preds = %for.body + ret void +} + +attributes #0 = { "target-features"="+sve" } + +!0 = distinct !{!0, !1, !2, !3, !4} +!1 = !{!"llvm.loop.vectorize.width", i32 4} +!2 = 
!{!"llvm.loop.vectorize.scalable.enable", i1 true} +!3 = !{!"llvm.loop.interleave.count", i32 1} +!4 = !{!"llvm.loop.vectorize.enable", i1 true}