Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -540,7 +540,8 @@
   /// vector loop, which can avoid the need to emit a scalar epilogue loop.
   bool preferPredicateOverEpilogue(TargetLibraryInfo *TLI,
                                    LoopVectorizationLegality *LVL,
-                                   InterleavedAccessInfo *IAI) const;
+                                   InterleavedAccessInfo *IAI,
+                                   bool NeedsReversePred) const;
 
   /// Query the target what the preferred style of tail folding is.
   /// \param IVUpdateMayOverflow Tells whether it is known if the IV update
@@ -1651,7 +1652,8 @@
                                 HardwareLoopInfo &HWLoopInfo) = 0;
   virtual bool preferPredicateOverEpilogue(TargetLibraryInfo *TLI,
                                            LoopVectorizationLegality *LVL,
-                                           InterleavedAccessInfo *IAI) = 0;
+                                           InterleavedAccessInfo *IAI,
+                                           bool NeedsReversePred) = 0;
   virtual TailFoldingStyle
   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) = 0;
   virtual std::optional<Instruction *> instCombineIntrinsic(
@@ -2049,8 +2051,9 @@
   }
   bool preferPredicateOverEpilogue(TargetLibraryInfo *TLI,
                                    LoopVectorizationLegality *LVL,
-                                   InterleavedAccessInfo *IAI) override {
-    return Impl.preferPredicateOverEpilogue(TLI, LVL, IAI);
+                                   InterleavedAccessInfo *IAI,
+                                   bool NeedsReversePred) override {
+    return Impl.preferPredicateOverEpilogue(TLI, LVL, IAI, NeedsReversePred);
   }
   TailFoldingStyle
   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) override {
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -165,7 +165,8 @@
 
   bool preferPredicateOverEpilogue(TargetLibraryInfo *TLI,
                                    LoopVectorizationLegality *LVL,
-                                   InterleavedAccessInfo *IAI) const {
+                                   InterleavedAccessInfo *IAI,
+                                   bool NeedsReversePred) const {
     return false;
   }
 
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -624,8 +624,9 @@
 
   bool preferPredicateOverEpilogue(TargetLibraryInfo *TLI,
                                    LoopVectorizationLegality *LVL,
-                                   InterleavedAccessInfo *IAI) {
-    return BaseT::preferPredicateOverEpilogue(TLI, LVL, IAI);
+                                   InterleavedAccessInfo *IAI,
+                                   bool NeedsReversePred) {
+    return BaseT::preferPredicateOverEpilogue(TLI, LVL, IAI, NeedsReversePred);
   }
 
   TailFoldingStyle
Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -347,6 +347,11 @@
   /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
   int isConsecutivePtr(Type *AccessTy, Value *Ptr) const;
 
+  /// This function returns true if it encounters a load or store in the loop
+  /// whose address is consecutive and decreasing. It calls isConsecutivePtr
+  /// to determine this.
+  bool containsDecreasingPointers();
+
   /// Returns true if the value V is uniform within the loop.
   bool isUniform(Value *V) const;
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -307,8 +307,8 @@
 
 bool TargetTransformInfo::preferPredicateOverEpilogue(
     TargetLibraryInfo *TLI, LoopVectorizationLegality *LVL,
-    InterleavedAccessInfo *IAI) const {
-  return TTIImpl->preferPredicateOverEpilogue(TLI, LVL, IAI);
+    InterleavedAccessInfo *IAI, bool NeedsReversePred) const {
+  return TTIImpl->preferPredicateOverEpilogue(TLI, LVL, IAI, NeedsReversePred);
 }
 
 TailFoldingStyle TargetTransformInfo::getPreferredTailFoldingStyle(
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -358,7 +358,8 @@
 
   bool preferPredicateOverEpilogue(TargetLibraryInfo *TLI,
                                    LoopVectorizationLegality *LVL,
-                                   InterleavedAccessInfo *IAI);
+                                   InterleavedAccessInfo *IAI,
+                                   bool NeedsReversePred);
 
   bool supportsScalableVectors() const { return ST->hasSVE(); }
 
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -49,8 +49,9 @@
     TFDisabled = 0x0,
     TFReductions = 0x01,
     TFRecurrences = 0x02,
+    TFReverse = 0x04,
     TFSimple = 0x80,
-    TFAll = TFReductions | TFRecurrences | TFSimple
+    TFAll = TFReductions | TFRecurrences | TFReverse | TFSimple
   };
 
   void operator=(const std::string &Val) {
@@ -71,10 +72,14 @@
         add(TFReductions);
       else if (TailFoldType == "recurrences")
         add(TFRecurrences);
+      else if (TailFoldType == "reverse")
+        add(TFReverse);
       else if (TailFoldType == "noreductions")
         remove(TFReductions);
       else if (TailFoldType == "norecurrences")
         remove(TFRecurrences);
+      else if (TailFoldType == "noreverse")
+        remove(TFReverse);
       else {
         errs() << "invalid argument " << TailFoldType.str()
@@ -106,7 +111,9 @@
         "recurrences)"
         "\nreductions  Use tail-folding for loops containing reductions"
         "\nrecurrences Use tail-folding for loops containing fixed order "
-        "recurrences"),
+        "recurrences"
+        "\nreverse     Use tail-folding for loops requiring reversed "
+        "predicates"),
     cl::location(TailFoldingKindLoc));
 
 // Experimental option that will only be fully functional when the
@@ -3381,7 +3388,8 @@
 
 bool AArch64TTIImpl::preferPredicateOverEpilogue(TargetLibraryInfo *TLI,
                                                  LoopVectorizationLegality *LVL,
-                                                 InterleavedAccessInfo *IAI) {
+                                                 InterleavedAccessInfo *IAI,
+                                                 bool NeedsReversePred) {
   if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
     return false;
 
@@ -3396,6 +3404,8 @@
     Required.add(TailFoldingKind::TFReductions);
   if (LVL->getFixedOrderRecurrences().size())
     Required.add(TailFoldingKind::TFRecurrences);
+  if (NeedsReversePred)
+    Required.add(TailFoldingKind::TFReverse);
 
   if (!Required)
     Required.add(TailFoldingKind::TFSimple);
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -305,7 +305,8 @@
                                 HardwareLoopInfo &HWLoopInfo);
   bool preferPredicateOverEpilogue(TargetLibraryInfo *TLI,
                                    LoopVectorizationLegality *LVL,
-                                   InterleavedAccessInfo *IAI);
+                                   InterleavedAccessInfo *IAI,
+                                   bool NeedsReversePred);
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP,
                                OptimizationRemarkEmitter *ORE);
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2240,7 +2240,8 @@
 
 bool ARMTTIImpl::preferPredicateOverEpilogue(TargetLibraryInfo *TLI,
                                              LoopVectorizationLegality *LVL,
-                                             InterleavedAccessInfo *IAI) {
+                                             InterleavedAccessInfo *IAI,
+                                             bool NeedsReversePred) {
   if (!EnableTailPredication) {
     LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
     return false;
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -652,6 +652,22 @@
   return Scalarize;
 }
 
+bool LoopVectorizationLegality::containsDecreasingPointers() {
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    // Scan the instructions in the block and look for addresses that are
+    // consecutive and decreasing.
+    for (Instruction &I : *BB) {
+      if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
+        Value *Ptr = getLoadStorePointerOperand(&I);
+        Type *ScalarTy = getLoadStoreType(&I);
+        if (isConsecutivePtr(ScalarTy, Ptr) == -1)
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
 bool LoopVectorizationLegality::canVectorizeInstrs() {
   BasicBlock *Header = TheLoop->getHeader();
 
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9791,8 +9791,13 @@
     return CM_ScalarEpilogueAllowed;
   };
 
+  // We call this to discover whether any load/store pointers in the loop have
+  // negative strides. Reversing the loop predicate for such accesses requires
+  // extra work, which may be expensive.
+  bool NeedsReversePred = LVL.containsDecreasingPointers();
+
   // 4) if the TTI hook indicates this is profitable, request predication.
-  if (TTI->preferPredicateOverEpilogue(TLI, &LVL, IAI))
+  if (TTI->preferPredicateOverEpilogue(TLI, &LVL, IAI, NeedsReversePred))
     return CM_ScalarEpilogueNotNeededUsePredicate;
 
   return CM_ScalarEpilogueAllowed;
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -1,9 +1,10 @@
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=disabled -S | FileCheck %s -check-prefix=CHECK-NOTF
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=default -S | FileCheck %s -check-prefix=CHECK-NOTF
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all -S | FileCheck %s -check-prefix=CHECK-TF
-; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=disabled+simple+reductions+recurrences -S | FileCheck %s -check-prefix=CHECK-TF
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=disabled+simple+reductions+recurrences+reverse -S | FileCheck %s -check-prefix=CHECK-TF
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+noreductions -S | FileCheck %s -check-prefix=CHECK-TF-NORED
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+norecurrences -S | FileCheck %s -check-prefix=CHECK-TF-NOREC
+; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=all+noreverse -S | FileCheck %s -check-prefix=CHECK-TF-NOREV
 ; RUN: opt -opaque-pointers=0 < %s -passes=loop-vectorize -sve-tail-folding=reductions -S | FileCheck %s -check-prefix=CHECK-TF-ONLYRED
 
 target triple = "aarch64-unknown-linux-gnu"
@@ -33,6 +34,14 @@
 ; CHECK-TF-NOREC: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
 ; CHECK-TF-NOREC: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]]
+; CHECK-TF-NOREV-LABEL: @simple_memset(
+; CHECK-TF-NOREV: vector.ph:
+; CHECK-TF-NOREV: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i64 0
+; CHECK-TF-NOREV: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-TF-NOREV: vector.body:
+; CHECK-TF-NOREV: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF-NOREV: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[SPLAT]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]]
+
 ; CHECK-TF-LABEL: @simple_memset(
 ; CHECK-TF: vector.ph:
 ; CHECK-TF: %[[INSERT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %val, i64 0
 ; CHECK-TF: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[INSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -91,6 +100,16 @@
 ; CHECK-TF-NOREC: middle.block:
 ; CHECK-TF-NOREC-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
 
+; CHECK-TF-NOREV-LABEL: @fadd_red_fast
+; CHECK-TF-NOREV: vector.body:
+; CHECK-TF-NOREV: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF-NOREV: %[[VEC_PHI:.*]] = phi <vscale x 4 x float>
+; CHECK-TF-NOREV: %[[LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32({{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]]
+; CHECK-TF-NOREV: %[[ADD:.*]] = fadd fast <vscale x 4 x float> %[[LOAD]]
+; CHECK-TF-NOREV: %[[SEL:.*]] = select fast <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]], <vscale x 4 x float> %[[ADD]], <vscale x 4 x float> %[[VEC_PHI]]
+; CHECK-TF-NOREV: middle.block:
+; CHECK-TF-NOREV-NEXT: call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[SEL]])
+
 ; CHECK-TF-LABEL: @fadd_red_fast
 ; CHECK-TF: vector.body:
 ; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
@@ -167,6 +186,19 @@
 ; CHECK-TF-NOREC: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
 ; CHECK-TF-NOREC: store <vscale x 4 x i32> %[[ADD]]
 
+; CHECK-TF-NOREV-LABEL: @add_recur
+; CHECK-TF-NOREV: entry:
+; CHECK-TF-NOREV: %[[PRE:.*]] = load i32, i32* %src, align 4
+; CHECK-TF-NOREV: vector.ph:
+; CHECK-TF-NOREV: %[[RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[PRE]]
+; CHECK-TF-NOREV: vector.body:
+; CHECK-TF-NOREV: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 4 x i1>
+; CHECK-TF-NOREV: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
+; CHECK-TF-NOREV: %[[LOAD]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32({{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]]
+; CHECK-TF-NOREV: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VECTOR_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
+; CHECK-TF-NOREV: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[LOAD]], %[[SPLICE]]
+; CHECK-TF-NOREV: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], {{.*}} <vscale x 4 x i1> %[[ACTIVE_LANE_MASK]])
+
 ; CHECK-TF-LABEL: @add_recur
 ; CHECK-TF: entry:
 ; CHECK-TF: %[[PRE:.*]] = load i32, i32* %src, align 4
@@ -238,6 +270,12 @@
 ; CHECK-TF-NOREC: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
 ; CHECK-TF-NOREC: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
 
+; CHECK-TF-NOREV-LABEL: @interleave(
+; CHECK-TF-NOREV: vector.body:
+; CHECK-TF-NOREV: %[[LOAD:.*]] = load <8 x float>, <8 x float>
+; CHECK-TF-NOREV: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+; CHECK-TF-NOREV: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+
 entry:
   br label %for.body
 
@@ -266,6 +304,55 @@
   ret void
 }
 
+define void @reverse(double* noalias %dst, double* noalias %src) #0 {
+; CHECK-NOTF-LABEL: @reverse(
+; CHECK-NOTF: vector.body:
+; CHECK-NOTF-NOT: %{{.*}} = phi <vscale x 2 x i1>
+; CHECK-NOTF: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* %18, align 8
+; CHECK-NOTF: %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+
+; CHECK-TF-NOREV-LABEL: @reverse(
+; CHECK-TF-NOREV: vector.body:
+; CHECK-TF-NOREV-NOT: %{{.*}} = phi <vscale x 2 x i1>
+; CHECK-TF-NOREV: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* %18, align 8
+; CHECK-TF-NOREV: %{{.*}} = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %[[LOAD]])
+
+; CHECK-TF-LABEL: @reverse(
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
+; CHECK-TF: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64({{.*}} %reverse
+
+; CHECK-TF-NORED-LABEL: @reverse(
+; CHECK-TF-NORED: vector.body:
+; CHECK-TF-NORED: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
+; CHECK-TF-NORED: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF-NORED: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64({{.*}} %reverse
+
+; CHECK-TF-NOREC-LABEL: @reverse(
+; CHECK-TF-NOREC: vector.body:
+; CHECK-TF-NOREC: %[[ACTIVE_LANE_MASK:.*]] = phi <vscale x 2 x i1>
+; CHECK-TF-NOREC: %[[REVERSE_MASK:.*]] = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %[[ACTIVE_LANE_MASK]])
+; CHECK-TF-NOREC: %[[MASKED_LOAD:.*]] = call <vscale x 2 x double> @llvm.masked.load.nxv2f64.p0nxv2f64({{.*}} %reverse
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %src, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %dst, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %cmp.not = icmp eq i64 %indvars.iv, 0
+  br i1 %cmp.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
attributes #0 = { "target-features"="+sve" } !0 = distinct !{!0, !1, !2, !3, !4}