diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5133,14 +5133,6 @@
     }
   }
 
-  // For scalable vectors don't use tail folding for low trip counts or
-  // optimizing for code size. We only permit this if the user has explicitly
-  // requested it.
-  if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
-      ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
-      MaxFactors.ScalableVF.isVector())
-    MaxFactors.ScalableVF = ElementCount::getScalable(0);
-
   // If we don't know the precise trip count, or if the trip count that we
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
new file
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
@@ -0,0 +1,73 @@
+; RUN: opt -loop-vectorize -S < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @trip7_i64(i64* noalias nocapture noundef %dst, i64* noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip7_i64(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 {{%.*}}, i64 7)
+; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK: call void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64> {{%.*}}, <vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]]
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}}
+; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i64, i64* %src, i64 %i.06
+  %0 = load i64, i64* %arrayidx, align 8
+  %mul = shl nsw i64 %0, 1
+  %arrayidx1 = getelementptr inbounds i64, i64* %dst, i64 %i.06
+  %1 = load i64, i64* %arrayidx1, align 8
+  %add = add nsw i64 %1, %mul
+  store i64 %add, i64* %arrayidx1, align 8
+  %inc = add nuw nsw i64 %i.06, 1
+  %exitcond.not = icmp eq i64 %inc, 7
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip5_i8(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 {{%.*}}, i64 5)
+; CHECK: {{%.*}} = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK: {{%.*}} = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK: call void @llvm.masked.store.nxv16i8.p0nxv16i8(<vscale x 16 x i8> {{%.*}}, <vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 16
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]]
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}}
+; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %src, i64 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %mul = shl i8 %0, 1
+  %arrayidx1 = getelementptr inbounds i8, i8* %dst, i64 %i.08
+  %1 = load i8, i8* %arrayidx1, align 1
+  %add = add i8 %mul, %1
+  store i8 %add, i8* %arrayidx1, align 1
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, 5
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll
new file
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll
@@ -0,0 +1,39 @@
+; RUN: opt -loop-vectorize -S < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @trip1024_i64(i64* noalias nocapture noundef %dst, i64* noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip1024_i64(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 {{%.*}}, i64 1024)
+; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK: call void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64> {{%.*}}, <vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]]
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}}
+; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i64, i64* %src, i64 %i.06
+  %0 = load i64, i64* %arrayidx, align 8
+  %mul = shl nsw i64 %0, 1
+  %arrayidx1 = getelementptr inbounds i64, i64* %dst, i64 %i.06
+  %1 = load i64, i64* %arrayidx1, align 8
+  %add = add nsw i64 %1, %mul
+  store i64 %add, i64* %arrayidx1, align 8
+  %inc = add nuw nsw i64 %i.06, 1
+  %exitcond.not = icmp eq i64 %inc, 1024
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" optsize }
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
new file
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -0,0 +1,33 @@
+; RUN: opt -loop-vectorize -riscv-v-vector-bits-min=128 -scalable-vectorization=on -force-target-instruction-cost=1 -S < %s | FileCheck %s
+
+target triple = "riscv64"
+
+define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip5_i8(
+; CHECK: vector.body:
+; CHECK: [[ACTIVE_LANE_MASK:%.*]] = icmp ule <vscale x 8 x i64> {{%.*}}, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 4, i32 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK: {{%.*}} = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* {{%.*}}, i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; CHECK: {{%.*}} = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* {{%.*}}, i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; CHECK: call void @llvm.masked.store.nxv8i8.p0nxv8i8(<vscale x 8 x i8> {{%.*}}, <vscale x 8 x i8>* {{%.*}}, i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %src, i64 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %mul = shl i8 %0, 1
+  %arrayidx1 = getelementptr inbounds i8, i8* %dst, i64 %i.08
+  %1 = load i8, i8* %arrayidx1, align 1
+  %add = add i8 %mul, %1
+  store i8 %add, i8* %arrayidx1, align 1
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, 5
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { "target-features"="+v,+d" }