Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1465,6 +1465,10 @@ /// to a stack reload. unsigned getGISelRematGlobalCost() const; + /// \returns the lower bound of a trip count to decide on vectorization + /// while tail-folding. + unsigned getMinTripCountTailFoldingThreshold() const; + /// \returns True if the target supports scalable vectors. bool supportsScalableVectors() const; @@ -1863,6 +1867,7 @@ ReductionFlags) const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; virtual unsigned getGISelRematGlobalCost() const = 0; + virtual unsigned getMinTripCountTailFoldingThreshold() const = 0; virtual bool enableScalableVectorization() const = 0; virtual bool supportsScalableVectors() const = 0; virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType, @@ -2502,6 +2507,10 @@ return Impl.getGISelRematGlobalCost(); } + unsigned getMinTripCountTailFoldingThreshold() const override { + return Impl.getMinTripCountTailFoldingThreshold(); + } + bool supportsScalableVectors() const override { return Impl.supportsScalableVectors(); } Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -817,6 +817,8 @@ unsigned getGISelRematGlobalCost() const { return 1; } + unsigned getMinTripCountTailFoldingThreshold() const { return 0; } + bool supportsScalableVectors() const { return false; } bool enableScalableVectorization() const { return false; } Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1132,6 +1132,10 @@ 
return TTIImpl->getGISelRematGlobalCost(); } +unsigned TargetTransformInfo::getMinTripCountTailFoldingThreshold() const { + return TTIImpl->getMinTripCountTailFoldingThreshold(); +} + bool TargetTransformInfo::supportsScalableVectors() const { return TTIImpl->supportsScalableVectors(); } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -335,6 +335,10 @@ return 2; } + unsigned getMinTripCountTailFoldingThreshold() const { + return ST->hasSVE() ? 5 : 0; + } + PredicationStyle emitGetActiveLaneMask() const { if (ST->hasSVE()) return PredicationStyle::DataAndControlFlow; Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -10145,8 +10145,19 @@ if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { - LLVM_DEBUG(dbgs() << "\n"); - SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; + if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { + LLVM_DEBUG(dbgs() << "\n"); + SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; + } else { + LLVM_DEBUG(dbgs() << " But the target considers the trip count too " + "small to consider vectorizing.\n"); + reportVectorizationFailure( + "The trip count is below the minimal threshold value.", + "loop trip count is too low, avoiding vectorization", + "LowTripCount", ORE, L); + Hints.emitRemarkWithHints(); + return false; + } } } Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll @@ 
-40,18 +40,22 @@ define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture noundef readonly %src) #0 { ; CHECK-LABEL: @trip5_i8( -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ] -; CHECK: {{%.*}} = call @llvm.masked.load.nxv16i8.p0nxv16i8(* {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK: {{%.*}} = call @llvm.masked.load.nxv16i8.p0nxv16i8(* {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK: call void @llvm.masked.store.nxv16i8.p0nxv16i8( {{%.*}}, * {{%.*}}, i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 16 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 5) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NOT:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) -; CHECK-NEXT: br i1 true, label %middle.block, label %vector.body +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[I_08]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP0]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[I_08]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP1]] +; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5 +; CHECK-NEXT: br i1 
[[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void ; entry: br label %for.body