diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1413,14 +1413,14 @@
 }
 
 bool ARMTTIImpl::emitGetActiveLaneMask() const {
-  if (!ST->hasMVEIntegerOps())
+  if (!ST->hasMVEIntegerOps() || DisableTailPredication)
     return false;
-  // TODO: Intrinsic @llvm.get.active.lane.mask is supported.
+  // Intrinsic @llvm.get.active.lane.mask is supported.
   // It is used in the MVETailPredication pass, which requires the number of
   // elements processed by this vector loop to setup the tail-predicated
   // loop.
-  return false;
+  return true;
 }
 
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6829,7 +6829,11 @@
     IV = IVRecipe->getVPValue();
   }
 
   VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
-  BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+  bool TailFolded = !CM.isScalarEpilogueAllowed();
+  if (TailFolded && CM.TTI.emitGetActiveLaneMask())
+    BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
+  else
+    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
   return BlockMaskCache[BB] = BlockMask;
 }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -685,6 +685,7 @@
     ICmpULE,
     SLPLoad,
     SLPStore,
+    ActiveLaneMask,
   };
 
 private:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -380,6 +380,20 @@
     State.set(this, V, Part);
     break;
   }
+  case VPInstruction::ActiveLaneMask: {
+    // Get first lane of vector induction variable.
+    Value *VIVElem0 = State.get(getOperand(0), {Part, 0});
+    // Get first lane of backedge-taken-count.
+    Value *ScalarBTC = State.get(getOperand(1), {Part, 0});
+
+    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
+    auto *PredTy = VectorType::get(Int1Ty, State.VF);
+    Instruction *Call = Builder.CreateIntrinsic(
+        Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()},
+        {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask");
+    State.set(this, Call, Part);
+    break;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -421,6 +435,10 @@
   case VPInstruction::SLPStore:
     O << "combined store";
     break;
+  case VPInstruction::ActiveLaneMask:
+    O << "active lane mask";
+    break;
+
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
@@ -45,9 +45,12 @@
 define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
 ; CHECK-LABEL: prefer_folding(
 ; PREFER-FOLDING: vector.body:
-; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
-; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
-; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
+; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
+; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
+; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
 ; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
 ;
 ; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
@@ -507,9 +510,13 @@
 define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
 ; CHECK-LABEL: float(
 ; PREFER-FOLDING: vector.body:
-; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
-; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
-; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
+; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
+; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
+; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask
+; PREFER-FOLDING: %index.next = add i32 %index, 4
 ; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@@ -15,9 +15,13 @@
 define dso_local void @sgt_loopguard(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
 ; COMMON-LABEL: @sgt_loopguard(
 ; COMMON: vector.body:
-; CHECK-TF: masked.load
-; CHECK-TF: masked.load
-; CHECK-TF: masked.store
+
+; CHECK-TF: %[[VIVELEM0:.*]] = extractelement <16 x i32> %vec.iv, i32 0
+; CHECK-TF: %[[SCALARBTC:.*]] = extractelement <16 x i32> %broadcast.splat, i32 0
+; CHECK-TF: %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %[[VIVELEM0]], i32 %[[SCALARBTC]])
+; CHECK-TF: llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
+; CHECK-TF: llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
+; CHECK-TF: llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask)
 entry:
   %cmp5 = icmp sgt i32 %N, 0
   br i1 %cmp5, label %while.body.preheader, label %while.end
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
@@ -41,11 +41,15 @@
 define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
 ; COMMON-LABEL: tail_folding_enabled(
 ; COMMON: vector.body:
-; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
-; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; COMMON: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; COMMON: %[[ELEM0:.*]] = add i64 %index, 0
+; COMMON: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429)
+; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
+; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
 ; COMMON: %[[ADD:.*]] = add nsw <4 x i32> %[[WML2]], %[[WML1]]
-; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]]
-; COMMON: br i1 %12, label %{{.*}}, label %vector.body
+; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]], {{.*}}<4 x i1> %active.lane.mask
+; COMMON: %index.next = add i64 %index, 4
+; COMMON: br i1 %{{.*}}, label %{{.*}}, label %vector.body
 
 entry:
   br label %for.body
@@ -75,13 +79,16 @@
 
 ; PREDFLAG-LABEL: tail_folding_disabled(
 ; PREDFLAG: vector.body:
-; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
-; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; PREDFLAG: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREDFLAG: %[[ELEM0:.*]] = add i64 %index, 0
+; PREDFLAG: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429)
+; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
+; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
 ; PREDFLAG: %{{.*}} = add nsw <4 x i32> %wide.masked.load1, %wide.masked.load
-; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32(
+; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
 ; PREDFLAG: %index.next = add i64 %index, 4
-; PREDFLAG: %12 = icmp eq i64 %index.next, 432
-; PREDFLAG: br i1 %{{.*}}, label %middle.block, label %vector.body, !llvm.loop !6
+; PREDFLAG: %[[CMP:.*]] = icmp eq i64 %index.next, 432
+; PREDFLAG: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !6
 
 entry:
   br label %for.body
@@ -102,6 +109,59 @@
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
 }
 
+define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+; PREDFLAG-LABEL: interleave4(
+; PREDFLAG: %[[ADD1:.*]] = add i32 %index, 0
+; PREDFLAG: %[[ADD2:.*]] = add i32 %index, 4
+; PREDFLAG: %[[ADD3:.*]] = add i32 %index, 8
+; PREDFLAG: %[[ADD4:.*]] = add i32 %index, 12
+; PREDFLAG: %[[BTC:.*]] = extractelement <4 x i32> %broadcast.splat, i32 0
+; PREDFLAG: %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %[[BTC]])
+; PREDFLAG: %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %[[BTC]])
+; PREDFLAG: %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %[[BTC]])
+; PREDFLAG: %[[ALM4:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD4]], i32 %[[BTC]])
+;
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
+;
+; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]])
+; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]])
+; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]])
+; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]])
+;
+entry:
+  %cmp8 = icmp sgt i32 %N, 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
+  store i32 %add, i32* %arrayidx2, align 4
+  %inc = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !14
+}
+
 ; CHECK: !0 = distinct !{!0, !1}
 ; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK-NEXT: !2 = distinct !{!2, !3, !1}
@@ -109,6 +169,7 @@
 ; CHECK-NEXT: !4 = distinct !{!4, !1}
 ; CHECK-NEXT: !5 = distinct !{!5, !3, !1}
 ; CHECK-NEXT: !6 = distinct !{!6, !1}
+
"target-features"="+armv8.1-m.main,+mve.fp" } !6 = distinct !{!6, !7, !8} @@ -118,3 +179,6 @@ !10 = distinct !{!10, !11, !12} !11 = !{!"llvm.loop.vectorize.predicate.enable", i1 false} !12 = !{!"llvm.loop.vectorize.enable", i1 true} + +!14 = distinct !{!14, !15} +!15 = !{!"llvm.loop.interleave.count", i32 4}