diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1701,6 +1701,11 @@
 private:
   unsigned NumPredStores = 0;
 
+  /// Convenience function that returns the value of vscale_range iff
+  /// vscale_range.min == vscale_range.max, or otherwise returns the value
+  /// returned by the corresponding TTI method.
+  Optional<unsigned> getVScaleForTuning() const;
+
   /// \return An upper bound for the vectorization factors for both
   /// fixed and scalable vectorization, where the minimum-known number of
   /// elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -5600,6 +5605,18 @@
   return MaxVF;
 }
 
+Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
+  if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
+    auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
+    auto Min = Attr.getVScaleRangeMin();
+    auto Max = Attr.getVScaleRangeMax();
+    if (Max && Min == Max)
+      return Max;
+  }
+
+  return TTI.getVScaleForTuning();
+}
+
 bool LoopVectorizationCostModel::isMoreProfitable(
     const VectorizationFactor &A, const VectorizationFactor &B) const {
   InstructionCost CostA = A.Cost;
@@ -5624,7 +5641,7 @@
   // Improve estimate for the vector width if it is scalable.
   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
-  if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
+  if (Optional<unsigned> VScale = getVScaleForTuning()) {
     if (A.Width.isScalable())
       EstimatedWidthA *= VScale.getValue();
     if (B.Width.isScalable())
@@ -5673,7 +5690,7 @@
 #ifndef NDEBUG
     unsigned AssumedMinimumVscale = 1;
-    if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
+    if (Optional<unsigned> VScale = getVScaleForTuning())
       AssumedMinimumVscale = VScale.getValue();
     unsigned Width = Candidate.Width.isScalable()
@@ -5885,8 +5902,20 @@
     return Result;
   }
 
+  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
+  // the main loop handles 8 lanes per iteration. We could still benefit from
+  // vectorizing the epilogue loop with VF=4.
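+  // (If no single value for vscale can be assumed, EstimatedRuntimeVF below
+  // simply falls back to the known minimum value of the main loop VF.)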
+  ElementCount EstimatedRuntimeVF = MainLoopVF;
+  if (MainLoopVF.isScalable()) {
+    EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
+    if (Optional<unsigned> VScale = getVScaleForTuning())
+      EstimatedRuntimeVF *= VScale.getValue();
+  }
+
   for (auto &NextVF : ProfitableVFs)
-    if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
+    if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
+          ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
+         ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
         LVP.hasPlanWithVF(NextVF.Width))
       Result = NextVF;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -2,24 +2,22 @@
 ; REQUIRES: asserts
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=0 --debug-only=loop-vectorize -force-target-instruction-cost=1 -S 2>%t | FileCheck %s --check-prefix=CHECK
 ; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=8 --debug-only=loop-vectorize -S 2>%t | FileCheck %s --check-prefix=CHECK
-; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-force-VF=8 --debug-only=loop-vectorize -S 2>%t | FileCheck %s --check-prefix=CHECK-VF8
 ; RUN: cat %t | FileCheck %s --check-prefix=DEBUG-FORCED
 
 target triple = "aarch64-linux-gnu"
 
-; DEBUG: LV: Checking a loop in "f1"
+; DEBUG: LV: Checking a loop in "main_vf_vscale_x_16"
 ; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
 ; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:vscale x 8, Epilogue Loop UF:1
 
-; DEBUG-FORCED: LV: Checking a loop in "f1"
+; DEBUG-FORCED: LV: Checking a loop in "main_vf_vscale_x_16"
 ; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced.
 ; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass)
 ; DEBUG-FORCED: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
 
-define void @f1(i8* %A) #0 {
-; CHECK-LABEL: @f1(
+define void @main_vf_vscale_x_16(i8* %A) #0 {
+; CHECK-LABEL: @main_vf_vscale_x_16(
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
@@ -105,7 +103,7 @@
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
-; CHECK-VF8-LABEL: @f1(
+; CHECK-VF8-LABEL: @main_vf_vscale_x_16(
 ; CHECK-VF8-NEXT:  iter.check:
 ; CHECK-VF8-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK-VF8:       vector.main.loop.iter.check:
@@ -195,4 +193,185 @@
   ret void
 }
+
+; DEBUG: LV: Checking a loop in "main_vf_vscale_x_2"
+; DEBUG: Create Skeleton for epilogue vectorized loop (first pass)
+; DEBUG: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
+
+; DEBUG-FORCED: LV: Checking a loop in "main_vf_vscale_x_2"
+; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced.
+; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass)
+; DEBUG-FORCED: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1
+
+; When the vector.body uses VF=vscale x 1 (or VF=vscale x 2 because
+; that's the minimum supported VF by SVE), we could still use a wide
+; fixed-width VF=8 for the epilogue if the vectors are known to be
+; sufficiently wide. This information can be deduced from vscale_range or
+; VScaleForTuning (set by mcpu/mtune).
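+; For example, with vscale_range(8, 8) each <vscale x 2 x i64> vector below is
+; known to hold 2 * 8 = 16 lanes, so a fixed-width VF=8 epilogue is still
+; narrower than the main loop's expected runtime width.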
+define void @main_vf_vscale_x_2(i64* %A) #0 vscale_range(8, 8) {
+; CHECK-LABEL: @main_vf_vscale_x_2(
+; CHECK-NEXT:  iter.check:
+; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK:       vector.main.loop.iter.check:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <vscale x 2 x i64>*
+; CHECK-NEXT:    store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, zeroinitializer), <vscale x 2 x i64>* [[TMP13]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <vscale x 2 x i64>*
+; CHECK-NEXT:    store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, zeroinitializer), <vscale x 2 x i64>* [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK:       vec.epilog.iter.check:
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK:       vec.epilog.ph:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK:       vec.epilog.vector.body:
+; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX2]], 0
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[TMP22]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <8 x i64>*
+; CHECK-NEXT:    store <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64>* [[TMP24]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
+; CHECK-NEXT:    br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       vec.epilog.middle.block:
+; CHECK-NEXT:    [[CMP_N1:%.*]] = icmp eq i64 1024, 1024
+; CHECK-NEXT:    br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       vec.epilog.scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
+; CHECK-NEXT:    store i64 1, i64* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+; CHECK-VF8-LABEL: @main_vf_vscale_x_2(
+; CHECK-VF8-NEXT:  iter.check:
+; CHECK-VF8-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-VF8:       vector.main.loop.iter.check:
+; CHECK-VF8-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-VF8-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-VF8-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-VF8:       vector.ph:
+; CHECK-VF8-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-VF8-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-VF8-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-VF8-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-VF8:       vector.body:
+; CHECK-VF8-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-VF8-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
+; CHECK-VF8-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
+; CHECK-VF8-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
+; CHECK-VF8-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
+; CHECK-VF8-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP4]]
+; CHECK-VF8-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP9]]
+; CHECK-VF8-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0
+; CHECK-VF8-NEXT:    [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <vscale x 2 x i64>*
+; CHECK-VF8-NEXT:    store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, zeroinitializer), <vscale x 2 x i64>* [[TMP13]], align 1
+; CHECK-VF8-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF8-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 2
+; CHECK-VF8-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 [[TMP15]]
+; CHECK-VF8-NEXT:    [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <vscale x 2 x i64>*
+; CHECK-VF8-NEXT:    store <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, zeroinitializer), <vscale x 2 x i64>* [[TMP17]], align 1
+; CHECK-VF8-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF8-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; CHECK-VF8-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; CHECK-VF8-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-VF8:       middle.block:
+; CHECK-VF8-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-VF8-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK-VF8:       vec.epilog.iter.check:
+; CHECK-VF8-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
+; CHECK-VF8-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
+; CHECK-VF8-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK-VF8:       vec.epilog.ph:
+; CHECK-VF8-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-VF8-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK-VF8:       vec.epilog.vector.body:
+; CHECK-VF8-NEXT:    [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[TMP21:%.*]] = add i64 [[INDEX2]], 0
+; CHECK-VF8-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP21]]
+; CHECK-VF8-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[TMP22]], i32 0
+; CHECK-VF8-NEXT:    [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <8 x i64>*
+; CHECK-VF8-NEXT:    store <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64>* [[TMP24]], align 1
+; CHECK-VF8-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8
+; CHECK-VF8-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
+; CHECK-VF8-NEXT:    br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-VF8:       vec.epilog.middle.block:
+; CHECK-VF8-NEXT:    [[CMP_N1:%.*]] = icmp eq i64 1024, 1024
+; CHECK-VF8-NEXT:    br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-VF8:       vec.epilog.scalar.ph:
+; CHECK-VF8-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-VF8-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-VF8:       for.body:
+; CHECK-VF8-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-VF8-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
+; CHECK-VF8-NEXT:    store i64 1, i64* [[ARRAYIDX]], align 1
+; CHECK-VF8-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-VF8-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024
+; CHECK-VF8-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-VF8:       exit.loopexit:
+; CHECK-VF8-NEXT:    br label [[EXIT]]
+; CHECK-VF8:       exit:
+; CHECK-VF8-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i64, i64* %A, i64 %iv
+  store i64 1, i64* %arrayidx, align 1
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, 1024
+  br i1 %exitcond, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
 attributes #0 = { "target-features"="+sve" }
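+
+; Note: vscale_range(8, 8) pins vscale to a single value, so the
+; getVScaleForTuning() helper added in this patch can return 8 directly from
+; the attribute without consulting TTI.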