Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7074,10 +7074,16 @@
   }
   VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
   bool TailFolded = !CM.isScalarEpilogueAllowed();
-  if (TailFolded && CM.TTI.emitGetActiveLaneMask())
-    BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
-  else
+
+  if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
+    // While ActiveLaneMask is a binary op that consumes the loop tripcount
+    // as its second argument, we only pass the IV here and extract the
+    // tripcount from the transform state, where codegen of the VP
+    // instructions happens.
+    BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
+  } else {
     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+  }
 
   return BlockMaskCache[BB] = BlockMask;
 }
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -383,14 +383,14 @@
   case VPInstruction::ActiveLaneMask: {
     // Get first lane of vector induction variable.
     Value *VIVElem0 = State.get(getOperand(0), {Part, 0});
-    // Get first lane of backedge-taken-count.
-    Value *ScalarBTC = State.get(getOperand(1), {Part, 0});
+    // Get the original loop tripcount.
+    Value *ScalarTC = State.TripCount;
 
     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
     auto *PredTy = FixedVectorType::get(Int1Ty, State.VF);
     Instruction *Call = Builder.CreateIntrinsic(
-        Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()},
-        {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask");
+        Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
+        {VIVElem0, ScalarTC}, nullptr, "active.lane.mask");
     State.set(this, Call, Part);
     break;
   }
Index: llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
@@ -14,7 +14,7 @@
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257)
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
 ; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
@@ -62,7 +62,7 @@
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257)
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
Index: llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
@@ -47,7 +47,7 @@
 ; PREFER-FOLDING: vector.body:
 ; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
-; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
+; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 431)
 ; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
 ; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
 ; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
@@ -331,7 +331,7 @@
 ; PREFER-FOLDING: vector.body:
 ; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
-; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
+; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 431)
 ; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
 ; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
 ; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@@ -18,8 +18,7 @@
 
 ; COMMON: vector.body:
 ; CHECK-TF: %[[VIVELEM0:.*]] = extractelement <16 x i32> %vec.iv, i32 0
-; CHECK-TF: %[[SCALARBTC:.*]] = extractelement <16 x i32> %broadcast.splat, i32 0
-; CHECK-TF: %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %[[VIVELEM0]], i32 %[[SCALARBTC]])
+; CHECK-TF: %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %[[VIVELEM0]], i32 %N)
 ; CHECK-TF: llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
 ; CHECK-TF: llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
 ; CHECK-TF: llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask)
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-loop-hint.ll
@@ -41,7 +41,7 @@
 ; CHECK: vector.body:
 ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK: %[[ELEM0:.*]] = add i64 %index, 0
-; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429)
+; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 430)
 ; CHECK: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
 ; CHECK: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
 ; CHECK: %[[ADD:.*]] = add nsw <4 x i32> %[[WML2]], %[[WML1]]
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll
@@ -20,7 +20,7 @@
 ; PREDFLAG: vector.body:
 ; PREDFLAG: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; PREDFLAG: %[[ELEM0:.*]] = add i64 %index, 0
-; PREDFLAG: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429)
+; PREDFLAG: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 430)
 ; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
 ; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
 ; PREDFLAG: %{{.*}} = add nsw <4 x i32> %wide.masked.load1, %wide.masked.load
@@ -54,11 +54,10 @@
 ; PREDFLAG: %[[ADD2:.*]] = add i32 %index, 4
 ; PREDFLAG: %[[ADD3:.*]] = add i32 %index, 8
 ; PREDFLAG: %[[ADD4:.*]] = add i32 %index, 12
-; PREDFLAG: %[[BTC:.*]] = extractelement <4 x i32> %broadcast.splat, i32 0
-; PREDFLAG: %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %[[BTC]])
-; PREDFLAG: %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %[[BTC]])
-; PREDFLAG: %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %[[BTC]])
-; PREDFLAG: %[[ALM4:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD4]], i32 %[[BTC]])
+; PREDFLAG: %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %N)
+; PREDFLAG: %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %N)
+; PREDFLAG: %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %N)
+; PREDFLAG: %[[ALM4:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD4]], i32 %N)
 ;
 ; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
 ; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
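Note on the off-by-one updates in the CHECK lines above (256 to 257, 429 to 430, 430 to 431): llvm.get.active.lane.mask(%base, %n) sets lane j active when %base + j is unsigned-less-than %n, so with the trip count rather than the backedge-taken count as the second operand, every constant grows by one. The IR below is an illustration only, not part of the patch; the function name @mask_example is made up.

declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)

; With VF = 4 and a trip count of 257, the final vector iteration starts at
; %index = 256 and the mask comes out as <1, 0, 0, 0>: only lane 0 satisfies
; 256 + j < 257, so the tail is folded into a predicated last iteration.
define <4 x i1> @mask_example(i32 %index) {
entry:
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 257)
  ret <4 x i1> %active.lane.mask
}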