diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5182,7 +5182,7 @@
     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
     ElementCount MaxSafeVF, bool FoldTailByMasking) {
   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
-  TypeSize WidestRegister = TTI.getRegisterBitWidth(
+  const TypeSize WidestRegister = TTI.getRegisterBitWidth(
       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                            : TargetTransformInfo::RGK_FixedWidthVector);
 
@@ -5209,9 +5209,14 @@
     return ElementCount::getFixed(1);
   }
 
-  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
-  if (ConstTripCount &&
-      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
+  unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
+  if (MaxVectorElementCount.isScalable() &&
+      TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
+    auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
+    auto Min = Attr.getVScaleRangeMin();
+    WidestRegisterMinEC *= Min;
+  }
+  if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
      (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
     // If loop trip count (TC) is known at compile time there is no point in
     // choosing VF greater than TC (as done in the loop below). Select maximum
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -1,10 +1,63 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=riscv64 -mattr=+v -passes=loop-vectorize < %s | FileCheck %s
 
-; FIXME: Using a <4 x i32> would be strictly better than tail folded
-; scalable vectorization in this case.
-define void @small_trip_count(i32* nocapture %a) nounwind vscale_range(4,1024) {
-; CHECK-LABEL: @small_trip_count(
+define void @small_trip_count_min_vlen_128(i32* nocapture %a) nounwind vscale_range(4,1024) {
+; CHECK-LABEL: @small_trip_count_min_vlen_128(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]],
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 4, 4
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[IV]]
+; CHECK-NEXT:    [[V:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[V]], 1
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[GEP]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV]], 3
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i32, i32* %a, i32 %iv
+  %v = load i32, i32* %gep, align 4
+  %add = add nsw i32 %v, 1
+  store i32 %add, i32* %gep, align 4
+  %iv.next = add i32 %iv, 1
+  %cond = icmp eq i32 %iv, 3
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Note: This test uses a vscale_range starting at 1, which is technically incompatible
+; with +v. If we expose a target hook for minimum vlen, this example will need to be reworked.
+define void @small_trip_count_min_vlen_32(i32* nocapture %a) nounwind vscale_range(1,1024) {
+; CHECK-LABEL: @small_trip_count_min_vlen_32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 2
@@ -35,7 +88,7 @@
 ; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -49,7 +102,7 @@
 ; CHECK-NEXT:    store i32 [[ADD]], i32* [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV]], 3
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
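Illustrative note (not part of the patch): the change above only treats a compile-time trip count as "small" when it fits in the guaranteed minimum lane count of the widest register, i.e. the known-minimum element count scaled by the vscale_range minimum. Below is a minimal standalone sketch of that rule; the struct and function names are hypothetical stand-ins, not the LLVM API.

#include <cstdint>

// Hypothetical stand-in for ElementCount: the known-minimum lane count of
// the widest vector register, plus whether it scales with vscale.
struct MaxElementCount {
  uint64_t KnownMinLanes; // e.g. 2 for <vscale x 2 x i32>
  bool Scalable;
};

// Mirrors the `ConstTripCount <= WidestRegisterMinEC` test added by the
// patch: returns true if a known trip count TC should cap the max VF.
static bool tripCountFitsInMinRegister(uint64_t TC, MaxElementCount EC,
                                       bool HasVScaleRange,
                                       uint64_t VScaleRangeMin) {
  uint64_t WidestRegisterMinEC = EC.KnownMinLanes;
  if (EC.Scalable && HasVScaleRange)
    WidestRegisterMinEC *= VScaleRangeMin; // guaranteed lanes, not just known-min
  return TC != 0 && TC <= WidestRegisterMinEC;
}

// Worked example matching the tests: with <vscale x 2 x i32> and
// vscale_range(4,1024), a trip count of 4 satisfies 4 <= 2*4, so the
// vectorizer can pick a fixed <4 x i32> plan (@small_trip_count_min_vlen_128).
// With vscale_range(1,1024), 4 > 2*1, so the tail-folded scalable plan is
// kept (@small_trip_count_min_vlen_32).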