diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -46,6 +46,7 @@
 class GlobalValue;
 class IntrinsicInst;
 class LoadInst;
+class LoopAccessInfo;
 class Loop;
 class ProfileSummaryInfo;
 class SCEV;
@@ -518,6 +519,13 @@
                                 TargetLibraryInfo *LibInfo,
                                 HardwareLoopInfo &HWLoopInfo) const;
 
+  /// Query the target whether it would be preferred to create a predicated
+  /// vector loop, which can avoid the need to emit a scalar epilogue loop.
+  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
+                                   DominatorTree *DT,
+                                   const LoopAccessInfo *LAI) const;
+
   /// @}
 
   /// \name Scalar Target Information
@@ -1201,6 +1209,12 @@
                                         AssumptionCache &AC,
                                         TargetLibraryInfo *LibInfo,
                                         HardwareLoopInfo &HWLoopInfo) = 0;
+  virtual bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+                                           ScalarEvolution &SE,
+                                           AssumptionCache &AC,
+                                           TargetLibraryInfo *TLI,
+                                           DominatorTree *DT,
+                                           const LoopAccessInfo *LAI) = 0;
   virtual bool isLegalAddImmediate(int64_t Imm) = 0;
   virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
   virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@@ -1471,6 +1485,12 @@
                                 HardwareLoopInfo &HWLoopInfo) override {
     return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
   }
+  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
+                                   DominatorTree *DT,
+                                   const LoopAccessInfo *LAI) override {
+    return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
+  }
   bool isLegalAddImmediate(int64_t Imm) override {
     return Impl.isLegalAddImmediate(Imm);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -213,6 +213,13 @@
     return false;
   }
 
+  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
+                                   DominatorTree *DT,
+                                   const LoopAccessInfo *LAI) const {
+    return false;
+  }
+
   void getUnrollingPreferences(Loop *, ScalarEvolution &,
                                TTI::UnrollingPreferences &) {}
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -510,6 +510,13 @@
     return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
   }
 
+  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
+                                   DominatorTree *DT,
+                                   const LoopAccessInfo *LAI) {
+    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
+  }
+
   int getInstructionLatency(const Instruction *I) {
     if (isa<LoadInst>(I))
       return getST()->getSchedModel().DefaultLoadLatency;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -243,6 +243,12 @@
   return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
 }
 
+bool TargetTransformInfo::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+    ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI,
+    DominatorTree *DT, const LoopAccessInfo *LAI) const {
+  return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
+}
+
 void TargetTransformInfo::getUnrollingPreferences(
     Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
   return TTIImpl->getUnrollingPreferences(L, SE, UP);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -203,7 +203,12 @@
                                 AssumptionCache &AC,
                                 TargetLibraryInfo *LibInfo,
                                 HardwareLoopInfo &HWLoopInfo);
-
+  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+                                   ScalarEvolution &SE,
+                                   AssumptionCache &AC,
+                                   TargetLibraryInfo *TLI,
+                                   DominatorTree *DT,
+                                   const LoopAccessInfo *LAI);
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
 
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1000,6 +1000,50 @@
   return true;
 }
 
+bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+                                             ScalarEvolution &SE,
+                                             AssumptionCache &AC,
+                                             TargetLibraryInfo *TLI,
+                                             DominatorTree *DT,
+                                             const LoopAccessInfo *LAI) {
+  // Creating a predicated vector loop is the first step for generating a
+  // tail-predicated hardware loop, for which we need the MVE masked
+  // load/store instructions:
+  if (!ST->hasMVEIntegerOps())
+    return false;
+
+  HardwareLoopInfo HWLoopInfo(L);
+  if (!HWLoopInfo.canAnalyze(*LI)) {
+    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+                         "analyzable.\n");
+    return false;
+  }
+
+  // This checks if we have the low-overhead branch architecture
+  // extension, and if a hardware loop will be created:
+  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo)) {
+    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+                         "profitable.\n");
+    return false;
+  }
+
+  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT)) {
+    LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
+                         "a candidate.\n");
+    return false;
+  }
+
+  // TODO: setting up a tail-predicated loop requires knowing the total
+  // number of elements processed by the loop. We therefore still need to
+  // determine the element size here and check that it is uniform for all
+  // operations in the vector loop, which means rejecting narrowing/widening
+  // operations. Until that is done, don't request predication of the vector
+  // loop, which is the main prep step for tail-predicated loops.
+
+  return false;
+}
+
+
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
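For context on what the new hook chooses between: without tail-folding the vectorizer emits a main vector loop followed by a scalar epilogue loop for the remaining iterations, whereas a predicated (tail-folded) loop handles those remaining iterations in the final vector iteration under a mask, so no epilogue is needed. The following is a rough source-level sketch of the two loop shapes with VF = 4; it is illustrative only and not code from this patch (process4, withScalarEpilogue and withPredication are made-up names):

// Illustrative only: the per-element work of the loops in the tests below,
// applied to at most four lanes, with lanes >= Active masked off.
static void process4(int *A, const int *B, const int *C, int Active = 4) {
  for (int Lane = 0; Lane < Active && Lane < 4; ++Lane)
    A[Lane] = B[Lane] + C[Lane];
}

// Shape 1: main vector loop plus a scalar epilogue for the remainder.
void withScalarEpilogue(int *A, const int *B, const int *C, int N) {
  int I = 0;
  for (; I + 4 <= N; I += 4)
    process4(&A[I], &B[I], &C[I]);
  for (; I < N; ++I) // scalar epilogue
    A[I] = B[I] + C[I];
}

// Shape 2: a single tail-folded (predicated) loop; the final iteration
// masks off the out-of-range lanes instead of falling back to scalar code.
void withPredication(int *A, const int *B, const int *C, int N) {
  for (int I = 0; I < N; I += 4)
    process4(&A[I], &B[I], &C[I], /*Active=*/N - I);
}

preferPredicateOverEpilogue lets a target ask the vectorizer for the second shape, which is what a tail-predicated MVE hardware loop ultimately needs.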
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7423,13 +7423,18 @@
 static ScalarEpilogueLowering
 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
-                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
+                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
+                          TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+                          AssumptionCache *AC, LoopInfo *LI,
+                          ScalarEvolution *SE, DominatorTree *DT,
+                          const LoopAccessInfo *LAI) {
   ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
   if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
       (F->hasOptSize() ||
        llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
     SEL = CM_ScalarEpilogueNotAllowedOptSize;
-  else if (PreferPredicateOverEpilog || Hints.getPredicate())
+  else if (PreferPredicateOverEpilog || Hints.getPredicate() ||
+           TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI))
     SEL = CM_ScalarEpilogueNotNeededUsePredicate;
 
   return SEL;
@@ -7449,7 +7454,10 @@
   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
   Function *F = L->getHeader()->getParent();
   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
-  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
+
+  ScalarEpilogueLowering SEL =
+      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
+                                PSE.getSE(), DT, LVL->getLAI());
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE,
                                 F, &Hints, IAI);
 
@@ -7541,7 +7549,9 @@
 
   // Check the function attributes and profiles to find out if this function
   // should be optimized for size.
-  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
+  ScalarEpilogueLowering SEL =
+      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
+                                PSE.getSE(), DT, LVL.getLAI());
 
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here. They may require CFG and instruction level transformations before
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
@@ -0,0 +1,49 @@
+; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf < %s -loop-vectorize -S | \
+; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
+
+; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=-mve < %s -loop-vectorize -enable-arm-maskedldst=true -S | \
+; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
+
+; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve < %s -loop-vectorize -enable-arm-maskedldst=false -S | \
+; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
+
+; Disabling the low-overhead branch extension will make
+; 'isHardwareLoopProfitable' return false, so that we test avoiding folding for
+; these cases.
+; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob < %s -loop-vectorize -enable-arm-maskedldst=true -S | \
+; RUN:   FileCheck %s -check-prefixes=CHECK,NO-FOLDING
+
+; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve < %s -loop-vectorize -enable-arm-maskedldst=true -S | \
+; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING
+
+define dso_local void @tail_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) {
+; CHECK-LABEL: tail_folding(
+;
+; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
+;
+; TODO: once TTI::preferPredicateOverEpilogue is fully implemented, this
+; loop will be tail-folded too:
+;
+; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; PREFER-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 430
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
@@ -7,6 +7,37 @@
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-arm-unknown-eabihf"
 
+define dso_local void @tail_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: tail_folding(
+; CHECK: vector.body:
+;
+; Once TTI::preferPredicateOverEpilogue is fully implemented, this loop
+; will be tail-folded too:
+;
+; CHECK-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; CHECK-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
+; CHECK: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 430
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+
 define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
 ; COMMON-LABEL: tail_folding_enabled(
 ; COMMON: vector.body:
@@ -50,7 +81,7 @@
 ; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32(
 ; PREDFLAG: %index.next = add i64 %index, 4
 ; PREDFLAG: %12 = icmp eq i64 %index.next, 432
-; PREDFLAG: br i1 %12, label %middle.block, label %vector.body, !llvm.loop !4
+; PREDFLAG: br i1 %{{.*}}, label %middle.block, label %vector.body, !llvm.loop !6
 entry:
   br label %for.body
 
@@ -77,7 +108,7 @@
 ; CHECK-NEXT: !3 = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK-NEXT: !4 = distinct !{!4, !1}
 ; CHECK-NEXT: !5 = distinct !{!5, !3, !1}
-
+; CHECK-NEXT: !6 = distinct !{!6, !1}
 attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
 
 !6 = distinct !{!6, !7, !8}
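Note that the ARM hook above still returns false unconditionally, which is why the PREFER-FOLDING checks in the new test do not yet expect masked loads and stores. A target that wants to opt in once its own analysis is ready only has to override the hook with the signature added in this patch and return true when a predicated loop is preferable. Below is a minimal sketch of such an override; 'MyTTIImpl' and 'hasMaskedLoadStores()' are hypothetical names, not part of this patch or of LLVM:

// Hypothetical sketch of another target opting in to tail-folding via the
// new TTI hook; MyTTIImpl and hasMaskedLoadStores() are made-up names.
bool MyTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
                                            ScalarEvolution &SE,
                                            AssumptionCache &AC,
                                            TargetLibraryInfo *TLI,
                                            DominatorTree *DT,
                                            const LoopAccessInfo *LAI) {
  // A predicated vector body needs masked load/store support; without it,
  // keep the default behaviour and let a scalar epilogue be emitted.
  if (!ST->hasMaskedLoadStores())
    return false;
  // Ask the vectorizer to fold the tail into the vector loop.
  return true;
}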