Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -472,6 +472,11 @@
                                    DominatorTree *DT,
                                    const LoopAccessInfo *LAI) const;
 
+  /// Query the target whether lowering of the llvm.set.loop.elements intrinsic
+  /// is supported and desired for this loop.
+  bool emitNumElementsVecLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                              bool TailFolded) const;
+
   /// @}
 
   /// \name Scalar Target Information
@@ -1195,6 +1200,8 @@
   preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                               AssumptionCache &AC, TargetLibraryInfo *TLI,
                               DominatorTree *DT, const LoopAccessInfo *LAI) = 0;
+  virtual bool emitNumElementsVecLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                      bool TailFolded) = 0;
   virtual bool isLegalAddImmediate(int64_t Imm) = 0;
   virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
   virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@@ -1472,6 +1479,10 @@
                                    const LoopAccessInfo *LAI) override {
     return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
   }
+  bool emitNumElementsVecLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                              bool TailFolded) override {
+    return Impl.emitNumElementsVecLoop(L, LI, SE, TailFolded);
+  }
   bool isLegalAddImmediate(int64_t Imm) override {
     return Impl.isLegalAddImmediate(Imm);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -143,6 +143,11 @@
     return false;
   }
 
+  bool emitNumElementsVecLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                              bool TailFold) const {
+    return false;
+  }
+
   void getUnrollingPreferences(Loop *, ScalarEvolution &,
                                TTI::UnrollingPreferences &) {}
 
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -492,6 +492,11 @@
     return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
   }
 
+  bool emitNumElementsVecLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                              bool TailFold) {
+    return BaseT::emitNumElementsVecLoop(L, LI, SE, TailFold);
+  }
+
   int getInstructionLatency(const Instruction *I) {
     if (isa<LoadInst>(I))
       return getST()->getSchedModel().DefaultLoadLatency;
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -1413,6 +1413,11 @@
 def int_set_loop_iterations :
   Intrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>;
 
+// Specify the number of elements processed by this (vector) loop, which
+// typically corresponds to the iteration count of the scalar loop.
+def int_set_loop_elements :
+  Intrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>;
+
 // Specify that the value given is the number of iterations that the next loop
 // will execute. Also test that the given count is not zero, allowing it to
 // control entry to a 'while' loop.
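For illustration of the intended use of the new intrinsic, here is a minimal sketch (hypothetical function and value names, not part of this patch): the vectorizer places a single call in the vector preheader, passing the number of elements the vector loop will process, i.e. the scalar loop's trip count.

; Hypothetical example: %N is the scalar trip count, i.e. the number of
; data elements processed by the vector loop.
define void @example(i32* %p, i32 %N) {
entry:
  br label %vector.ph

vector.ph:                                        ; the new intrinsic goes here
  call void @llvm.set.loop.elements.i32(i32 %N)
  br label %vector.body

vector.body:                                      ; masked vector loads/stores when tail-folded
  br label %exit

exit:
  ret void
}

declare void @llvm.set.loop.elements.i32(i32)

The intrinsic itself has no result and is only a marker; a later target pass (MVETailPredication in this patch) consumes it to recover the element count.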
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -233,6 +233,11 @@
   return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
 }
 
+bool TargetTransformInfo::emitNumElementsVecLoop(Loop *L, LoopInfo *LI,
+    ScalarEvolution &SE, bool TailFolded) const {
+  return TTIImpl->emitNumElementsVecLoop(L, LI, SE, TailFolded);
+}
+
 void TargetTransformInfo::getUnrollingPreferences(
     Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
   return TTIImpl->getUnrollingPreferences(L, SE, UP);
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -237,9 +237,13 @@
                                    TargetLibraryInfo *TLI, DominatorTree *DT,
                                    const LoopAccessInfo *LAI);
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
 
+  bool emitNumElementsVecLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                              bool TailFolded) const;
+
   bool shouldBuildLookupTablesForConstant(Constant *C) const {
     // In the ROPI and RWPI relocation models we can't have pointers to global
     // variables or functions in constant data, so don't convert switches to
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1309,7 +1309,16 @@
   return canTailPredicateLoop(L, LI, SE, DL, LAI);
 }
-
+bool ARMTTIImpl::emitNumElementsVecLoop(Loop *L, LoopInfo *LI,
+    ScalarEvolution &SE, bool TailFolded) const {
+  // If this loop is tail-folded, we always want to emit the
+  // llvm.set.loop.elements intrinsic, so that it can be picked up by the
+  // MVETailPredication pass, which needs to know the number of elements
+  // processed by this vector loop.
+  if (TailFolded)
+    return true;
+  return false;
+}
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -639,6 +639,10 @@
   /// Emit bypass checks to check any memory assumptions we may have made.
   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
 
+  /// Emit the llvm.set.loop.elements IR intrinsic that models the number of
+  /// data elements processed by the vector loop.
+  void emitNumElementsVecLoop(BasicBlock *Bypass, Value *Count);
+
   /// Compute the transformed value of Index at offset StartValue using step
   /// StepValue.
   /// For integer induction, returns StartValue + Index * StepValue.
@@ -2830,6 +2834,22 @@
   LVer->prepareNoAliasMetadata();
 }
 
+void InnerLoopVectorizer::emitNumElementsVecLoop(BasicBlock *Bypass,
+                                                 Value *Count) {
+  if (EnableVPlanNativePath)
+    return;
+
+  if (!TTI->emitNumElementsVecLoop(OrigLoop, LI, *PSE.getSE(),
+                                   Cost->foldTailByMasking()))
+    return;
+
+  IRBuilder<> Builder(Bypass->getTerminator());
+  Function *NumElems = Intrinsic::getDeclaration(
+      Bypass->getParent()->getParent(), Intrinsic::set_loop_elements,
+      Count->getType());
+  Builder.CreateCall(NumElems, Count);
+}
+
 Value *InnerLoopVectorizer::emitTransformedIndex(
     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
     const InductionDescriptor &ID) const {
@@ -3024,6 +3044,12 @@
   // faster.
   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
 
+  // Emit an intrinsic in the vector preheader that represents the number of
+  // data elements processed by the vector loop, which corresponds to the
+  // trip count of the scalar loop. TTI is queried first to check that the
+  // target supports and wants this intrinsic for this loop.
+  emitNumElementsVecLoop(LoopVectorPreHeader, Count);
+
   // Generate the induction variable.
   // The loop step is equal to the vectorization factor (num of SIMD elements)
   // times the unroll factor (num of SIMD instructions).
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@@ -14,6 +14,10 @@
 ;
 define dso_local void @sgt_loopguard(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
 ; COMMON-LABEL: @sgt_loopguard(
+; CHECK-TF: %[[TC:.*]] = sub i32 %0, %[[SMIN:.*]]
+; CHECK-TF: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK-TF: vector.ph:
+; CHECK-TF: call void @llvm.set.loop.elements.i32(i32 %[[TC]])
 ; COMMON: vector.body:
 ; CHECK-TF: masked.load
 ; CHECK-TF: masked.load
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
@@ -10,10 +10,7 @@
 define dso_local void @tail_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
 ; CHECK-LABEL: tail_folding(
 ; CHECK: vector.body:
-;
-; This needs implementation of TTI::preferPredicateOverEpilogue,
-; then this will be tail-folded too:
-;
+; CHECK-NOT: call void @llvm.set.loop.elements
 ; CHECK-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
 ; CHECK-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
 ; CHECK: br i1 %{{.*}}, label %{{.*}}, label %vector.body
@@ -40,6 +37,9 @@
 define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
 ; COMMON-LABEL: tail_folding_enabled(
+; COMMON: vector.ph:
+; COMMON: call void @llvm.set.loop.elements.i64(i64 430)
+; COMMON: br label %vector.body
 ; COMMON: vector.body:
 ; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
 ; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
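For context, a rough sketch of the tail-folded loop shape that the updated tail_folding_enabled test now expects. This is illustrative only: names are simplified and the mask computation, GEPs and constants are an assumption based on typical fold-tail output for a 430-element, VF=4 loop; the CHECK lines above are authoritative.

define void @tail_folded_shape(i32* %A, i32* %B, i32* %C) {
entry:
  br label %vector.ph

vector.ph:
  ; The new intrinsic records the element count (the scalar trip count).
  call void @llvm.set.loop.elements.i64(i64 430)
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Build the lane mask: lanes whose element index exceeds 429 (the backedge
  ; taken count) are disabled, so the final iteration is safely masked.
  %splatins = insertelement <4 x i64> undef, i64 %index, i32 0
  %splat = shufflevector <4 x i64> %splatins, <4 x i64> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i64> %splat, <i64 0, i64 1, i64 2, i64 3>
  %mask = icmp ule <4 x i64> %induction, <i64 429, i64 429, i64 429, i64 429>
  %gep.b = getelementptr inbounds i32, i32* %B, i64 %index
  %bc.b = bitcast i32* %gep.b to <4 x i32>*
  %load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %bc.b, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %gep.c = getelementptr inbounds i32, i32* %C, i64 %index
  %bc.c = bitcast i32* %gep.c to <4 x i32>*
  %load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %bc.c, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %sum = add nsw <4 x i32> %load.c, %load.b
  %gep.a = getelementptr inbounds i32, i32* %A, i64 %index
  %bc.a = bitcast i32* %gep.a to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %sum, <4 x i32>* %bc.a, i32 4, <4 x i1> %mask)
  %index.next = add i64 %index, 4
  %done = icmp eq i64 %index.next, 432
  br i1 %done, label %exit, label %vector.body

exit:
  ret void
}

declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.set.loop.elements.i64(i64)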