Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -189,6 +189,11 @@
     cl::desc("Only loops with vectorization factor equal to or larger than "
              "the specified value are considered for epilogue vectorization."));
 
+static cl::opt<unsigned> MinTripCountTailFoldingThreshold(
+    "vectorizer-min-trip-count-tail-folding", cl::init(5), cl::Hidden,
+    cl::desc("Minimal trip count constant value to ignore tail folding "
+             "for scalable vectors."));
+
 /// Loops with a known constant trip count below this number are vectorized only
 /// if no scalar iteration overheads are incurred.
 static cl::opt<unsigned> TinyTripCountVectorThreshold(
@@ -5069,6 +5074,12 @@
     }
   }
 
+  if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
+      ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
+      TC <= MinTripCountTailFoldingThreshold &&
+      MaxFactors.ScalableVF.isVector())
+    MaxFactors.ScalableVF = ElementCount::getScalable(0);
+
   // If we don't know the precise trip count, or if the trip count that we
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
@@ -40,18 +40,22 @@
 
 define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture noundef readonly %src) #0 {
 ; CHECK-LABEL: @trip5_i8(
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ]
-; CHECK: {{%.*}} = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
-; CHECK: {{%.*}} = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
-; CHECK: call void @llvm.masked.store.nxv16i8.p0nxv16i8(<vscale x 16 x i8> {{%.*}}, <vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 5)
-; CHECK-NEXT: [[ACTIVE_LANE_MASK_NOT:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i32 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT: br i1 true, label %middle.block, label %vector.body
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP0]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP1]]
+; CHECK-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %for.body
Index: llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -6,53 +6,40 @@
 define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture noundef readonly %src) #0 {
 ; CHECK-LABEL: @trip5_i8(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i64 -6, [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP2]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
-; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[TMP6]], 1
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 5, [[TMP7]]
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP8]], i64 5)
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <vscale x 8 x i8>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* [[TMP11]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
-; CHECK-NEXT:    [[TMP12:%.*]] = shl <vscale x 8 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP13]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <vscale x 8 x i8>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* [[TMP15]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
-; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 8 x i8> [[TMP12]], [[WIDE_MASKED_LOAD1]]
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP14]] to <vscale x 8 x i8>*
-; CHECK-NEXT:    call void @llvm.masked.store.nxv8i8.p0nxv8i8(<vscale x 8 x i8> [[TMP16]], <vscale x 8 x i8>* [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP19]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 [[TMP0]], i64 5)
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP3]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = shl <8 x i8> [[WIDE_MASKED_LOAD]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP7]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = add <8 x i8> [[TMP4]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
+; CHECK-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP8]], <8 x i8>* [[TMP9]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP20:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP20]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP10]], 1
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP21]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP11]]
 ; CHECK-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX1]], align 1
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5