Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -478,6 +478,11 @@
                                    DominatorTree *DT,
                                    const LoopAccessInfo *LAI) const;
 
+  /// Query the target whether it supports lowering the llvm.get.active.mask
+  /// intrinsic and whether emitting it is desired for this loop.
+  bool emitGetActiveMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                         bool TailFolded) const;
+
   /// @}
 
   /// \name Scalar Target Information
@@ -1228,6 +1233,8 @@
   preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                               AssumptionCache &AC, TargetLibraryInfo *TLI,
                               DominatorTree *DT, const LoopAccessInfo *LAI) = 0;
+  virtual bool emitGetActiveMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                 bool TailFolded) = 0;
   virtual bool isLegalAddImmediate(int64_t Imm) = 0;
   virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
   virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@@ -1530,6 +1537,10 @@
                                    const LoopAccessInfo *LAI) override {
     return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
   }
+  bool emitGetActiveMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                         bool TailFolded) override {
+    return Impl.emitGetActiveMask(L, LI, SE, TailFolded);
+  }
   bool isLegalAddImmediate(int64_t Imm) override {
     return Impl.isLegalAddImmediate(Imm);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -144,6 +144,11 @@
     return false;
   }
 
+  bool emitGetActiveMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                         bool TailFolded) const {
+    return false;
+  }
+
   void getUnrollingPreferences(Loop *, ScalarEvolution &,
                                TTI::UnrollingPreferences &) {}
 
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -498,6 +498,11 @@
     return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
   }
 
+  bool emitGetActiveMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                         bool TailFolded) {
+    return BaseT::emitGetActiveMask(L, LI, SE, TailFolded);
+  }
+
   int getInstructionLatency(const Instruction *I) {
     if (isa<LoadInst>(I))
       return getST()->getSchedModel().DefaultLoadLatency;
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -1414,6 +1414,11 @@
 def int_set_loop_iterations :
   Intrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>;
 
+// Specify the mask of active/inactive vector lanes.
+def int_get_active_mask :
+  Intrinsic<[llvm_anyvector_ty],
+            [llvm_anyvector_ty], [IntrNoDuplicate]>;
+
 // Specify that the value given is the number of iterations that the next loop
 // will execute. Also test that the given count is not zero, allowing it to
 // control entry to a 'while' loop.
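As a side note for reviewers: the pattern this patch later uses in VPlan.cpp to materialize a call to the new intrinsic can be condensed into a small helper. The sketch below is illustrative only and not part of the patch; it assumes an IRBuilder already positioned in the vector body, and Intrinsic::get_active_mask only exists once the Intrinsics.td change above has landed.

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  // Illustrative helper: given the lane predicate computed by the
  // icmp ule against the backedge-taken count, emit
  //   %mask = call <N x i1> @llvm.get.active.mask.vNi1.vNi1(<N x i1> %pred)
  // at the builder's current insertion point.
  static Value *emitActiveMask(IRBuilder<> &Builder, Value *Pred) {
    Module *M = Builder.GetInsertBlock()->getModule();
    Function *Decl = Intrinsic::getDeclaration(
        M, Intrinsic::get_active_mask, {Pred->getType(), Pred->getType()});
    return Builder.CreateCall(Decl, {Pred});
  }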
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -235,6 +235,11 @@
   return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
 }
 
+bool TargetTransformInfo::emitGetActiveMask(Loop *L, LoopInfo *LI,
+                                            ScalarEvolution &SE, bool TailFolded) const {
+  return TTIImpl->emitGetActiveMask(L, LI, SE, TailFolded);
+}
+
 void TargetTransformInfo::getUnrollingPreferences(
     Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
   return TTIImpl->getUnrollingPreferences(L, SE, UP);
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -250,6 +250,9 @@
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
 
+  bool emitGetActiveMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                         bool TailFolded) const;
+
   bool shouldBuildLookupTablesForConstant(Constant *C) const {
     // In the ROPI and RWPI relocation models we can't have pointers to global
     // variables or functions in constant data, so don't convert switches to
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1384,7 +1384,16 @@
   return canTailPredicateLoop(L, LI, SE, DL, LAI);
 }
 
+bool ARMTTIImpl::emitGetActiveMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                   bool TailFolded) const {
+  // If this loop is tail-folded, we always want to emit the
+  // llvm.get.active.mask intrinsic, so that it can be picked up by the
+  // MVETailPredication pass, which needs to know the number of elements
+  // processed by this vector loop.
+  return TailFolded;
+}
+
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
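Since the hook defaults to false and is overridden per target, other targets can opt in independently of ARM. A hypothetical implementation mirroring the ARM behaviour could look as follows; FooTTIImpl is an illustrative class name, not an in-tree target:

  // Hypothetical target override, not part of this patch: request
  // llvm.get.active.mask only for tail-folded vector loops, where the
  // lane mask carries useful information for later lowering.
  bool FooTTIImpl::emitGetActiveMask(Loop *L, LoopInfo *LI,
                                     ScalarEvolution &SE,
                                     bool TailFolded) const {
    return TailFolded;
  }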
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6826,7 +6826,11 @@
     IV = IVRecipe->getVPValue();
   }
   VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
-  BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+  if (CM.TTI.emitGetActiveMask(CM.TheLoop, CM.LI, *CM.PSE.getSE(),
+                               !CM.isScalarEpilogueAllowed()))
+    BlockMask = Builder.createNaryOp(VPInstruction::ActiveMask, {IV, BTC});
+  else
+    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
   return BlockMaskCache[BB] = BlockMask;
 }
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -675,6 +675,7 @@
     ICmpULE,
     SLPLoad,
     SLPStore,
+    ActiveMask,
   };
 
 private:
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -380,6 +380,19 @@
     State.set(this, V, Part);
     break;
   }
+  case VPInstruction::ActiveMask: {
+    Value *IV = State.get(getOperand(0), Part);
+    Value *TC = State.get(getOperand(1), Part);
+    Value *Pred = Builder.CreateICmpULE(IV, TC);
+
+    BasicBlock *BB = Builder.GetInsertBlock();
+    Function *Decl = Intrinsic::getDeclaration(
+        BB->getParent()->getParent(), Intrinsic::get_active_mask,
+        {Pred->getType(), Pred->getType()});
+    Value *Call = Builder.CreateCall(Decl, {Pred});
+    State.set(this, Call, Part);
+    break;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
@@ -41,11 +41,14 @@
 define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
 ; COMMON-LABEL: tail_folding_enabled(
 ; COMMON: vector.body:
-; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
-; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; COMMON: %induction = add <4 x i64> %broadcast.splat,
+; COMMON: %[[PRED:.*]] = icmp ule <4 x i64> %induction,
+; COMMON: %[[MASK:.*]] = call <4 x i1> @llvm.get.active.mask.v4i1.v4i1(<4 x i1> %[[PRED]])
+; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %[[MASK]]
+; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %[[MASK]]
 ; COMMON: %[[ADD:.*]] = add nsw <4 x i32> %[[WML2]], %[[WML1]]
-; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]]
-; COMMON: br i1 %12, label %{{.*}}, label %vector.body
+; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]], {{.*}}<4 x i1> %[[MASK]]
+; COMMON: br i1 %{{.*}}, label %{{.*}}, label %vector.body
 entry:
   br label %for.body
@@ -75,13 +78,16 @@
 
 ; PREDFLAG-LABEL: tail_folding_disabled(
 ; PREDFLAG: vector.body:
-; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
-; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; PREDFLAG: %induction = add <4 x i64> %broadcast.splat,
+; PREDFLAG: %[[PRED:.*]] = icmp ule <4 x i64> %induction,
+; PREDFLAG: %[[MASK:.*]] = call <4 x i1> @llvm.get.active.mask.v4i1.v4i1(<4 x i1> %[[PRED]])
+; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[MASK]]
+; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[MASK]]
 ; PREDFLAG: %{{.*}} = add nsw <4 x i32> %wide.masked.load1, %wide.masked.load
-; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32(
+; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[MASK]]
 ; PREDFLAG: %index.next = add i64 %index, 4
-; PREDFLAG: %12 = icmp eq i64 %index.next, 432
-; PREDFLAG: br i1 %{{.*}}, label %middle.block, label %vector.body, !llvm.loop !6
+; PREDFLAG: %[[CMP:.*]] = icmp eq i64 %index.next, 432
+; PREDFLAG: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !6
 entry:
   br label %for.body
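For context on the intended consumer: the MVETailPredication pass referred to in the ARM comment above would need to locate this call in the vector body. A minimal recognition sketch, assuming this patch has landed (the actual MVETailPredication changes are not part of this diff):

  #include "llvm/IR/IntrinsicInst.h"
  using namespace llvm;

  // Illustrative only: find the llvm.get.active.mask call in a vector loop
  // body; its operand is the lane predicate describing how many elements
  // the loop processes.
  static IntrinsicInst *findActiveMask(BasicBlock &VectorBody) {
    for (Instruction &I : VectorBody)
      if (auto *II = dyn_cast<IntrinsicInst>(&I))
        if (II->getIntrinsicID() == Intrinsic::get_active_mask)
          return II;
    return nullptr;
  }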