Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -514,6 +514,12 @@
                                 TargetLibraryInfo *LibInfo,
                                 HardwareLoopInfo &HWLoopInfo) const;
 
+  /// Query the target whether it prefers to create a predicated vector loop,
+  /// which can avoid the need to emit a scalar epilogue loop.
+  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
+                                   DominatorTree *DT) const;
+
   /// @}
 
   /// \name Scalar Target Information
@@ -1194,6 +1200,11 @@
                                         AssumptionCache &AC,
                                         TargetLibraryInfo *LibInfo,
                                         HardwareLoopInfo &HWLoopInfo) = 0;
+  virtual bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+                                           ScalarEvolution &SE,
+                                           AssumptionCache &AC,
+                                           TargetLibraryInfo *TLI,
+                                           DominatorTree *DT) = 0;
   virtual bool isLegalAddImmediate(int64_t Imm) = 0;
   virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
   virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@@ -1464,6 +1475,11 @@
                                 HardwareLoopInfo &HWLoopInfo) override {
     return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
   }
+  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
+                                   DominatorTree *DT) override {
+    return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT);
+  }
   bool isLegalAddImmediate(int64_t Imm) override {
     return Impl.isLegalAddImmediate(Imm);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -209,6 +209,12 @@
     return false;
   }
 
+  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
+                                   DominatorTree *DT) const {
+    return false;
+  }
+
   void getUnrollingPreferences(Loop *, ScalarEvolution &,
                                TTI::UnrollingPreferences &) {}
 
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -508,6 +508,12 @@
     return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
   }
 
+  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
+                                   DominatorTree *DT) {
+    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT);
+  }
+
   int getInstructionLatency(const Instruction *I) {
     if (isa<LoadInst>(I))
       return getST()->getSchedModel().DefaultLoadLatency;
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -242,6 +242,12 @@
   return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
 }
 
+bool TargetTransformInfo::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+    ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI,
+    DominatorTree *DT) const {
+  return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT);
+}
+
 void TargetTransformInfo::getUnrollingPreferences(
     Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
   return TTIImpl->getUnrollingPreferences(L, SE, UP);
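A hedged sketch of how a backend other than ARM might use the new hook: a
target overrides preferPredicateOverEpilogue in its TTI implementation and
answers from its own analysis. This is illustrative only and not part of the
patch; MyTTIImpl and hasPredicatedVectorLoops() are hypothetical names.

  // Hypothetical target override (sketch only; MyTTIImpl and the subtarget
  // query hasPredicatedVectorLoops() are illustrative, not in this patch).
  bool MyTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
                                              ScalarEvolution &SE,
                                              AssumptionCache &AC,
                                              TargetLibraryInfo *TLI,
                                              DominatorTree *DT) {
    // A predicated vector body only pays off when the hardware can mask
    // lanes cheaply; bail out otherwise.
    if (!getST()->hasPredicatedVectorLoops())
      return false;
    // Only consider loops whose trip count SCEV can reason about, since the
    // lane mask is derived from the remaining element count.
    const SCEV *BTC = SE.getBackedgeTakenCount(L);
    return !isa<SCEVCouldNotCompute>(BTC);
  }
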
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -200,7 +200,11 @@
                                 AssumptionCache &AC,
                                 TargetLibraryInfo *LibInfo,
                                 HardwareLoopInfo &HWLoopInfo);
-
+  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+                                   ScalarEvolution &SE,
+                                   AssumptionCache &AC,
+                                   TargetLibraryInfo *TLI,
+                                   DominatorTree *DT);
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
 
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -993,6 +993,39 @@
   return true;
 }
 
+bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
+                                             ScalarEvolution &SE,
+                                             AssumptionCache &AC,
+                                             TargetLibraryInfo *TLI,
+                                             DominatorTree *DT) {
+  // Low-overhead branches are only supported in the 'low-overhead branch'
+  // extension of v8.1-m.
+  if (!ST->hasLOB() || DisableLowOverheadLoops)
+    return false;
+
+  HardwareLoopInfo HWLoopInfo(L);
+  if (!HWLoopInfo.canAnalyze(*LI))
+    return false;
+
+  // If we don't create a hardware-loop, we can't create a tail-predicated
+  // hardware-loop.
+  if (!isHardwareLoopProfitable(L, SE, AC, TLI, HWLoopInfo))
+    return false;
+
+  if (!HWLoopInfo.isHardwareLoopCandidate(SE, *LI, *DT))
+    return false;
+
+  // TODO: setting up a tail-predicated loop requires the total number of
+  // elements processed by the loop, so we still need to determine the
+  // element size here and check that it is uniform for all operations in
+  // the vector loop. Loops with narrowing/widening operations must be
+  // rejected, and for those we should not predicate the vector loop,
+  // which is the main prep step for tail-predicated loops.
+
+  return false;
+}
+
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
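The TODO above calls for checking that every memory operation in the loop
uses one element size. A minimal sketch of what that check could look like,
assuming a free function with access to the module's DataLayout; the helper
name is illustrative and the function is not part of this patch.

  // Illustrative helper for the TODO above: returns true if every load and
  // store in the loop operates on the same scalar element size, i.e. the
  // loop contains no narrowing or widening memory operations.
  static bool hasUniformElementSize(Loop *L, const DataLayout &DL) {
    uint64_t ElemSizeInBits = 0;
    for (BasicBlock *BB : L->blocks()) {
      for (Instruction &I : *BB) {
        Type *AccessTy = nullptr;
        if (auto *Ld = dyn_cast<LoadInst>(&I))
          AccessTy = Ld->getType();
        else if (auto *St = dyn_cast<StoreInst>(&I))
          AccessTy = St->getValueOperand()->getType();
        if (!AccessTy)
          continue;
        uint64_t Size = DL.getTypeSizeInBits(AccessTy);
        if (ElemSizeInBits && Size != ElemSizeInBits)
          return false; // Mixed sizes: a narrowing/widening operation.
        ElemSizeInBits = Size;
      }
    }
    return true;
  }
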
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7434,13 +7434,17 @@
 
 static ScalarEpilogueLowering
 getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
-                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
+                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
+                          TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+                          AssumptionCache *AC, LoopInfo *LI,
+                          ScalarEvolution *SE, DominatorTree *DT) {
   ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
   if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
       (F->hasOptSize() ||
        llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
     SEL = CM_ScalarEpilogueNotAllowedOptSize;
-  else if (PreferPredicateOverEpilog || Hints.getPredicate())
+  else if (PreferPredicateOverEpilog || Hints.getPredicate() ||
+           TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT))
     SEL = CM_ScalarEpilogueNotNeededUsePredicate;
 
   return SEL;
@@ -7460,7 +7464,9 @@
   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
   Function *F = L->getHeader()->getParent();
   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
-  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
+  ScalarEpilogueLowering SEL =
+      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
+                                PSE.getSE(), DT);
 
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE,
                                 F, &Hints, IAI);
@@ -7552,7 +7558,9 @@
 
   // Check the function attributes and profiles to find out if this function
   // should be optimized for size.
-  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
+  ScalarEpilogueLowering SEL =
+      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
+                                PSE.getSE(), DT);
 
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here. They may require CFG and instruction level transformations before
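For context on the condition above: Hints.getPredicate() is driven by loop
metadata, e.g. clang's vectorize_predicate pragma (which predates this patch),
while the new TTI call lets a target make the same choice without any user
annotation. An illustrative C++ source loop that requests the predicated form
explicitly, for comparison:

  // Requesting tail-folding by hand via the loop pragma; with this patch the
  // ARM backend can eventually make the same decision automatically through
  // TTI::preferPredicateOverEpilogue.
  void saxpy(int N, float A, float *__restrict X, float *__restrict Y) {
  #pragma clang loop vectorize_predicate(enable)
    for (int i = 0; i < N; i++)
      Y[i] = A * X[i] + Y[i];
  }
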
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
@@ -7,6 +7,37 @@
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-arm-unknown-eabihf"
 
+define dso_local void @tail_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: tail_folding(
+; CHECK: vector.body:
+;
+; This still needs the full implementation of TTI::preferPredicateOverEpilogue;
+; once that lands, this loop will be tail-folded too:
+;
+; CHECK-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; CHECK-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
+; CHECK: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 430
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
 define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
 ; COMMON-LABEL: tail_folding_enabled(
 ; COMMON: vector.body:
@@ -50,7 +81,7 @@
 ; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32(
 ; PREDFLAG: %index.next = add i64 %index, 4
 ; PREDFLAG: %12 = icmp eq i64 %index.next, 432
-; PREDFLAG: br i1 %12, label %middle.block, label %vector.body, !llvm.loop !4
+; PREDFLAG: br i1 %{{.*}}, label %middle.block, label %vector.body, !llvm.loop !6
 entry:
   br label %for.body
 
@@ -77,7 +108,7 @@
 ; CHECK-NEXT: !3 = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK-NEXT: !4 = distinct !{!4, !1}
 ; CHECK-NEXT: !5 = distinct !{!5, !3, !1}
-
+; CHECK-NEXT: !6 = distinct !{!6, !1}
 attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
 
 !6 = distinct !{!6, !7, !8}
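For reference, the new @tail_folding function is the IR for a simple add loop
like the C++ sketch below (illustrative only; the IR above is what the test
actually checks). The trip count of 430 is not a multiple of the vector width
of 4, so without tail-folding the vectorizer must emit a scalar epilogue for
the remaining 430 % 4 = 2 iterations; with tail-folding, the final vector
iteration is masked instead and no epilogue loop is needed.

  // Source-level analogue of the @tail_folding test function; __restrict
  // mirrors the noalias arguments in the IR.
  void tail_folding(int *__restrict A, int *__restrict B, int *__restrict C) {
    // 430 iterations: not a multiple of 4, so a predicated (tail-folded)
    // vector body removes the need for a 2-iteration scalar epilogue.
    for (int i = 0; i < 430; i++)
      A[i] = B[i] + C[i];
  }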