diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2247,6 +2247,23 @@
     collectSupportedLoops(*InnerL, LI, ORE, V);
 }
 
+/// Report successful vectorization of \p TheLoop, both as a debug message and
+/// as an optimization remark.
+static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
+                                VectorizationFactor VF, unsigned IC) {
+  LLVM_DEBUG(debugVectorizationMessage(
+      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
+      nullptr));
+  StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
+  ORE->emit([&]() {
+    return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
+                              TheLoop->getHeader())
+           << "vectorized " << LoopType << "loop (vectorization width: "
+           << ore::NV("VectorizationFactor", VF.Width)
+           << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
+  });
+}
+
 //===----------------------------------------------------------------------===//
 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
 // LoopVectorizationCostModel and LoopVectorizationPlanner.
@@ -7510,8 +7527,22 @@
 VectorizationFactor
 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
-  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
   ElementCount VF = UserVF;
+  if (UserVF.isScalable() && !TTI->supportsScalableVectors() &&
+      !ForceTargetSupportsScalableVectors) {
+    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: User VF=" << VF
+                      << " is scalable, but the target does not support "
+                         "scalable vectors.\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+                                        OrigLoop->getStartLoc(),
+                                        OrigLoop->getHeader())
+             << "Cannot vectorize outer loop with user-specified vectorization "
+             << "factor " << ore::NV("UserVectorizationFactor", VF)
+             << " as the target does not support scalable vectors.";
+    });
+    return VectorizationFactor::Disabled();
+  }
   // Outer loop handling: They may require CFG and instruction level
   // transformations before even evaluating whether vectorization is profitable.
   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
   // the vectorization pipeline.
@@ -9922,6 +9953,8 @@
     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
   }
 
+  reportVectorization(ORE, L, VF, 1);
+
   // Mark the loop as already vectorized to avoid vectorizing again.
   Hints.setAlreadyVectorized();
   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
@@ -10473,13 +10506,7 @@
       DisableRuntimeUnroll = true;
     }
     // Report the vectorization decision.
-    ORE->emit([&]() {
-      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
-                                L->getHeader())
-             << "vectorized loop (vectorization width: "
-             << NV("VectorizationFactor", VF.Width)
-             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
-    });
+    reportVectorization(ORE, L, VF, IC);
   }
 
   if (ORE->allowExtraAnalysis(LV_NAME))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -120,7 +120,10 @@
   // Get or create a region for the loop containing BB.
   Loop *CurrentLoop = LI->getLoopFor(BB);
   VPRegionBlock *ParentR = nullptr;
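+  // Create a region only for loops at or below the depth of TheLoop, i.e. for
+  // TheLoop itself and the loops nested inside it; loops enclosing TheLoop
+  // (relevant when TheLoop is not outermost) do not get a region of their own.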
-  if (CurrentLoop) {
+  if (CurrentLoop && CurrentLoop->getLoopDepth() >= TheLoop->getLoopDepth()) {
     auto Iter = Loop2Region.insert({CurrentLoop, nullptr});
     if (Iter.second)
       Iter.first->second = new VPRegionBlock(
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/outer_loop_scalable.ll b/llvm/test/Transforms/LoopVectorize/RISCV/outer_loop_scalable.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/outer_loop_scalable.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v -passes=loop-vectorize -enable-vplan-native-path %s -pass-remarks=loop-vectorize -S 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARK
+
+; void test(int n, int **a)
+; {
+;   #pragma clang loop vectorize_width(4, scalable)
+;   for (int i = 0; i < n; ++i) {
+;     for (int j = 0; j < n; ++j) {
+;       a[i][j] = 2;
+;     }
+;   }
+; }
+
+; CHECK-REMARK: remark: <unknown>:0:0: vectorized outer loop (vectorization width: vscale x 4, interleaved count: 1)
+
+define dso_local void @test(i32 %0, ptr %1) {
+; CHECK-LABEL: define dso_local void @test
+; CHECK-SAME: (i32 [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP33:%.*]]
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i64> [[TMP11]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 1, [[TMP14]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP15]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[TMP22:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[TMP22]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x ptr> @llvm.masked.gather.nxv4p0.nxv4p0(<vscale x 4 x ptr> [[TMP16]], i32 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x ptr> poison)
+; CHECK-NEXT:    br label [[TMP17:%.*]]
+; CHECK:       17:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP19:%.*]], [[TMP17]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, <vscale x 4 x ptr> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x ptr> [[TMP18]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP19]] = add nuw nsw <vscale x 4 x i64> [[VEC_PHI]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <vscale x 4 x i64> [[TMP19]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
+; CHECK-NEXT:    br i1 [[TMP21]], label [[TMP22]], label [[TMP17]]
+; CHECK:       22:
+; CHECK-NEXT:    [[TMP23:%.*]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq <vscale x 4 x i64> [[TMP23]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP5]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[TMP32:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TMP4]] ]
+; CHECK-NEXT:    br label [[TMP28:%.*]]
+; CHECK:       28:
+; CHECK-NEXT:    [[TMP29:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP35:%.*]], [[TMP34:%.*]] ]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load ptr, ptr [[TMP30]], align 8
+; CHECK-NEXT:    br label [[TMP37:%.*]]
+; CHECK:       32:
+; CHECK-NEXT:    br label [[TMP33]]
+; CHECK:       33:
+; CHECK-NEXT:    ret void
+; CHECK:       34:
+; CHECK-NEXT:    [[TMP35]] = add nuw nsw i64 [[TMP29]], 1
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[TMP35]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[TMP32]], label [[TMP28]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       37:
+; CHECK-NEXT:    [[TMP38:%.*]] = phi i64 [ 0, [[TMP28]] ], [ [[TMP40:%.*]], [[TMP37]] ]
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i64 [[TMP38]]
+; CHECK-NEXT:    store i32 2, ptr [[TMP39]], align 4
+; CHECK-NEXT:    [[TMP40]] = add nuw nsw i64 [[TMP38]], 1
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[TMP40]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP41]], label [[TMP34]], label [[TMP37]]
+;
+  %3 = icmp sgt i32 %0, 0
+  br i1 %3, label %4, label %11
+
+4:
+  %5 = zext i32 %0 to i64
+  br label %6
+
+6:
+  %7 = phi i64 [ 0, %4 ], [ %13, %12 ]
+  %8 = getelementptr inbounds ptr, ptr %1, i64 %7
+  %9 = load ptr, ptr %8, align 8
+  br label %15
+
+10:
+  br label %11
+
+11:
+  ret void
+
+12:
+  %13 = add nuw nsw i64 %7, 1
+  %14 = icmp eq i64 %13, %5
+  br i1 %14, label %10, label %6, !llvm.loop !9
+
+15:
+  %16 = phi i64 [ 0, %6 ], [ %18, %15 ]
+  %17 = getelementptr inbounds i32, ptr %9, i64 %16
+  store i32 2, ptr %17, align 4
+  %18 = add nuw nsw i64 %16, 1
+  %19 = icmp eq i64 %18, %5
+  br i1 %19, label %12, label %15
+}
+
+!9 = distinct !{!9, !10, !11, !12, !13}
+!10 = !{!"llvm.loop.mustprogress"}
+!11 = !{!"llvm.loop.vectorize.width", i32 4}
+!12 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_no_scalable.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_no_scalable.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_no_scalable.ll
@@ -0,0 +1,53 @@
+; RUN: opt -mtriple=x86_64 -passes=loop-vectorize -enable-vplan-native-path %s -pass-remarks-analysis=loop-vectorize -disable-output 2>&1 | FileCheck %s
+
+; CHECK: Cannot vectorize outer loop with user-specified vectorization factor vscale x 4 as the target does not support scalable vectors
+
+; void test(int n, int **a)
+; {
+;   #pragma clang loop vectorize_width(4, scalable)
+;   for (int i = 0; i < n; ++i) {
+;     for (int j = 0; j < n; ++j) {
+;       a[i][j] = 2;
+;     }
+;   }
+; }
+
+define dso_local void @test(i32 %0, ptr %1) {
+  %3 = icmp sgt i32 %0, 0
+  br i1 %3, label %4, label %11
+
+4:
+  %5 = zext i32 %0 to i64
+  br label %6
+
+6:
+  %7 = phi i64 [ 0, %4 ], [ %13, %12 ]
+  %8 = getelementptr inbounds ptr, ptr %1, i64 %7
+  %9 = load ptr, ptr %8, align 8
+  br label %15
+
+10:
+  br label %11
+
+11:
+  ret void
+
+12:
+  %13 = add nuw nsw i64 %7, 1
+  %14 = icmp eq i64 %13, %5
+  br i1 %14, label %10, label %6, !llvm.loop !9
+
+15:
+  %16 = phi i64 [ 0, %6 ], [ %18, %15 ]
+  %17 = getelementptr inbounds i32, ptr %9, i64 %16
+  store i32 2, ptr %17, align 4
+  %18 = add nuw nsw i64 %16, 1
+  %19 = icmp eq i64 %18, %5
+  br i1 %19, label %12, label %15
+}
+
+!9 = distinct !{!9, !10, !11, !12, !13}
+!10 = !{!"llvm.loop.mustprogress"}
+!11 = !{!"llvm.loop.vectorize.width", i32 4}
+!12 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test3.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test3.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test3.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path -pass-remarks=loop-vectorize < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARK
+
+; void test(int n, int **a)
+; {
+;   for (int k = 0; k < n; ++k) {
+;     a[k][0] = 0;
+;     #pragma clang loop vectorize_width(4)
+;     for (int i = 0; i < n; ++i) {
+;       for (int j = 0; j < n; ++j) {
+;         a[i][j] = 2 + k;
+;       }
+;     }
+;   }
+; }
+
+; CHECK-REMARK: remark: <unknown>:0:0: vectorized outer loop (vectorization width: 4, interleaved count: 1)
+
+define void @test(i32 %0, ptr %1) {
+; CHECK-LABEL: define void @test
+; CHECK-SAME: (i32 [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP7:%.*]]
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    br label [[TMP8:%.*]]
+; CHECK:       6:
+; CHECK-NEXT:    br label [[TMP7]]
+; CHECK:       7:
+; CHECK-NEXT:    ret void
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i64 [ 0, [[TMP4]] ], [ [[TMP29:%.*]], [[TMP28:%.*]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
+; CHECK-NEXT:    store i32 0, ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP9]] to i32
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP5]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP13]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[TMP20:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[TMP20]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP14]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT:    br label [[TMP15:%.*]]
+; CHECK:       15:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP17:%.*]], [[TMP15]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, <4 x ptr> [[WIDE_MASKED_GATHER]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> [[TMP16]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP17]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq <4 x i64> [[TMP17]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP18]], i32 0
+; CHECK-NEXT:    br i1 [[TMP19]], label [[TMP20]], label [[TMP15]]
+; CHECK:       20:
+; CHECK-NEXT:    [[TMP21:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq <4 x i64> [[TMP21]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP5]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[TMP28]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TMP8]] ]
+; CHECK-NEXT:    br label [[TMP24:%.*]]
+; CHECK:       24:
+; CHECK-NEXT:    [[TMP25:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP32:%.*]], [[TMP31:%.*]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[TMP26]], align 8
+; CHECK-NEXT:    br label [[TMP34:%.*]]
+; CHECK:       28:
+; CHECK-NEXT:    [[TMP29]] = add nuw nsw i64 [[TMP9]], 1
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[TMP29]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP30]], label [[TMP6:%.*]], label [[TMP8]]
+; CHECK:       31:
+; CHECK-NEXT:    [[TMP32]] = add nuw nsw i64 [[TMP25]], 1
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[TMP32]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP33]], label [[TMP28]], label [[TMP24]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       34:
+; CHECK-NEXT:    [[TMP35:%.*]] = phi i64 [ 0, [[TMP24]] ], [ [[TMP37:%.*]], [[TMP34]] ]
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i64 [[TMP35]]
+; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP36]], align 4
+; CHECK-NEXT:    [[TMP37]] = add nuw nsw i64 [[TMP35]], 1
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[TMP37]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP38]], label [[TMP31]], label [[TMP34]]
+;
+  %3 = icmp sgt i32 %0, 0
+  br i1 %3, label %4, label %7
+
+4:
+  %5 = zext i32 %0 to i64
+  br label %8
+
+6:
+  br label %7
+
+7:
+  ret void
+
+8:
+  %9 = phi i64 [ 0, %4 ], [ %19, %18 ]
+  %10 = getelementptr inbounds ptr, ptr %1, i64 %9
+  %11 = load ptr, ptr %10, align 8
+  store i32 0, ptr %11, align 4
+  %12 = trunc i64 %9 to i32
+  %13 = add i32 %12, 2
+  br label %14
+
+14:
+  %15 = phi i64 [ 0, %8 ], [ %22, %21 ]
+  %16 = getelementptr inbounds ptr, ptr %1, i64 %15
+  %17 = load ptr, ptr %16, align 8
+  br label %24
+
+18:
+  %19 = add nuw nsw i64 %9, 1
+  %20 = icmp eq i64 %19, %5
+  br i1 %20, label %6, label %8
+
+21:
+  %22 = add nuw nsw i64 %15, 1
+  %23 = icmp eq i64 %22, %5
+  br i1 %23, label %18, label %14, !llvm.loop !13
+
+24:
+  %25 = phi i64 [ 0, %14 ], [ %27, %24 ]
+  %26 = getelementptr inbounds i32, ptr %17, i64 %25
+  store i32 %13, ptr %26, align 4
+  %27 = add nuw nsw i64 %25, 1
+  %28 = icmp eq i64 %27, %5
+  br i1 %28, label %21, label %24
+}
+
+!13 = distinct !{!13, !14, !15, !16}
+!14 = !{!"llvm.loop.vectorize.width", i32 4}
+!15 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!16 = !{!"llvm.loop.vectorize.enable", i1 true}