Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -316,7 +316,7 @@
 // executing the inner loop will execute the same iterations). This check is
 // very constrained for now but it will be relaxed in the future. \p Lp is
 // considered uniform if it meets all the following conditions:
-// 1) it has a canonical IV (starting from 0 and with stride 1),
+// 1) it has a stride-one IV,
 // 2) its latch terminator is a conditional branch and,
 // 3) its latch condition is a compare instruction whose operands are the
 //    canonical IV and an OuterLp invariant.
@@ -334,7 +334,7 @@
 // before introducing the aforementioned infrastructure. However, if this is not
 // the case, we should move the \p OuterLp independent checks to a separate
 // function that is only executed once for each \p Lp.
-static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
+static bool isUniformLoop(Loop *Lp, Loop *OuterLp, ScalarEvolution *SE) {
   assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
 
   // If Lp is the outer loop, it's uniform by definition.
@@ -343,9 +343,21 @@
   assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
 
   // 1.
-  PHINode *IV = Lp->getCanonicalInductionVariable();
+  PHINode *IV = Lp->getInductionVariable(*SE);
   if (!IV) {
-    LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
+    LLVM_DEBUG(dbgs() << "LV: IV not found.\n");
+    return false;
+  }
+  const SCEV *IVSCEV = SE->getSCEV(IV);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IVSCEV);
+  if (!AR) {
+    LLVM_DEBUG(dbgs() << "LV: Bad IV - Not an AddRecExpr SCEV " << *IVSCEV
+                      << "\n");
+    return false;
+  }
+  const SCEV *Step = AR->getStepRecurrence(*SE);
+  if (!Step->isOne()) {
+    LLVM_DEBUG(dbgs() << "LV: Not stride-one IV.\n");
     return false;
   }
 
@@ -379,13 +391,13 @@
 
 // Return true if \p Lp and all its nested loops are uniform with regard to \p
 // OuterLp.
-static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
-  if (!isUniformLoop(Lp, OuterLp))
+static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp, ScalarEvolution *SE) {
+  if (!isUniformLoop(Lp, OuterLp, SE))
     return false;
 
   // Check if nested loops are uniform.
   for (Loop *SubLp : *Lp)
-    if (!isUniformLoopNest(SubLp, OuterLp))
+    if (!isUniformLoopNest(SubLp, OuterLp, SE))
       return false;
 
   return true;
@@ -529,8 +541,8 @@
 
   // Check whether inner loops are uniform. At this point, we only support
   // simple outer loops scenarios with uniform nested loops.
-  if (!isUniformLoopNest(TheLoop /*loop nest*/,
-                         TheLoop /*context outer loop*/)) {
+  if (!isUniformLoopNest(TheLoop /*loop nest*/, TheLoop /*context outer loop*/,
+                         PSE.getSE())) {
     reportVectorizationFailure("Outer loop contains divergent loops",
         "loop control flow is not understood by vectorizer",
         "CFGNotUnderstood", ORE, TheLoop);
Index: llvm/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll
+++ llvm/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll
@@ -20,7 +20,7 @@
 
 ; Case 1 (for (j = i; j < M; j++)): Inner loop with divergent IV start.
 ; CHECK-LABEL: iv_start
-; CHECK: LV: Not vectorizing: Outer loop contains divergent loops.
+; CHECK: LV: Not vectorizing: Unsupported conditional branch.
 ; CHECK: LV: Not vectorizing: Unsupported outer loop.
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
Index: llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
+++ llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
@@ -1,3 +1,8 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
 ; extern int arr[8][8];
 ; extern int arr2[8];
 ;
@@ -12,42 +17,71 @@
 ;       arr[i2][i1] = i1 + n;
 ;   }
 ; }
-;
-; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
-; CHECK-LABEL: vector.ph:
-; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> poison, i32 %n, i64 0
-; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> poison, <4 x i32> zeroinitializer
-
-; CHECK-LABEL: vector.body:
-; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
-; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
-; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> %[[VecInd]]
-; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
-; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[VecIndTr]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
-; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]]
-; CHECK: br label %[[InnerLoop:.+]]
-
-; CHECK: [[InnerLoop]]:
-; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ]
-; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
-; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[StoreVal]], <4 x ptr> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8>
-; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0
-; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
-
-; CHECK: [[ForInc]]:
-; CHECK: %[[IndNext]] = add nuw i64 %[[Ind]], 4
-; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
-; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
 
 @arr2 = external global [8 x i32], align 16
 @arr = external global [8 x [8 x i32]], align 16
 
-; Function Attrs: norecurse nounwind uwtable
 define void @foo(i32 %n) {
+; CHECK-LABEL: define void @foo
+; CHECK-SAME: (i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_INC82:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_INC82]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32>
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    br label [[FOR_BODY31:%.*]]
+; CHECK:       for.body31:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[FOR_BODY31]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <4 x i64> [[VEC_PHI]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP3]], <4 x ptr> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP5]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[FOR_INC82]], label [[FOR_BODY31]]
+; CHECK:       for.inc82:
+; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 8, 8
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END10:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT22:%.*]], [[FOR_INC8:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, i64 [[INDVARS_IV21]]
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[INDVARS_IV21]] to i32
+; CHECK-NEXT:    store i32 [[TMP11]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDVARS_IV21]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP12]], [[N]]
+; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
+; CHECK:       for.body3:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV21]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_INC8]], label [[FOR_BODY3]]
+; CHECK:       for.inc8:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT22]] = add nuw nsw i64 [[INDVARS_IV21]], 1
+; CHECK-NEXT:    [[EXITCOND23:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT22]], 8
+; CHECK-NEXT:    br i1 [[EXITCOND23]], label [[FOR_END10]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.end10:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -77,6 +111,129 @@
   ret void
 }
 
+; typedef unsigned long long i64;
+; extern int a[1024][1024];
+; extern int a2[1024];
+;
+; void foo(int n, i64 i1Start, i64 i1End, i64 i2Start, i64 i2End)
+; {
+;   int i1, i2;
+;
+; #pragma clang loop vectorize(enable) vectorize_width(4)
+;   for (i1 = i1Start; i1 < i1End; i1++) {
+;     a2[i1] = i1;
+;     for (i2 = i2Start; i2 < i2End; i2++)
+;       a[i2][i1] = i1 + n;
+;   }
+; }
+
+@a2 = external global [1024 x i32], align 16
+@a = external global [1024 x [1024 x i32]], align 16
+
+define void @foo2(i32 %n, i64 %i1Start, i64 %i1End, i64 %i2Start, i64 %i2End) {
+; CHECK-LABEL: define void @foo2
+; CHECK-SAME: (i32 [[N:%.*]], i64 [[I1START:%.*]], i64 [[I1END:%.*]], i64 [[I2START:%.*]], i64 [[I2END:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[I2END]], [[I1START]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[I1START]], [[N_VEC]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[I1START]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[I2START]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i64> poison, i64 [[I1END]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT4]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[I2END]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_INC86:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_INC86]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr @a2, i64 0, <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32>
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP2]], <4 x ptr> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    br label [[FOR_BODY31:%.*]]
+; CHECK:       for.body31:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT3]], [[VECTOR_BODY]] ], [ [[TMP6:%.*]], [[FOR_BODY31]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x [1024 x i32]], ptr @a, i64 0, <4 x i64> [[VEC_PHI]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP4]], <4 x ptr> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP6]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[FOR_INC86]], label [[FOR_BODY31]]
+; CHECK:       for.inc86:
+; CHECK-NEXT:    [[TMP9:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq <4 x i64> [[TMP9]], [[BROADCAST_SPLAT8]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END10:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I1START]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT22:%.*]], [[FOR_INC8:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr @a2, i64 0, i64 [[INDVARS_IV21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDVARS_IV21]] to i32
+; CHECK-NEXT:    store i32 [[TMP12]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc i64 [[INDVARS_IV21]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[N]]
+; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
+; CHECK:       for.body3:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[I2START]], [[FOR_BODY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x [1024 x i32]], ptr @a, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV21]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[I1END]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_INC8]], label [[FOR_BODY3]]
+; CHECK:       for.inc8:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT22]] = add nuw nsw i64 [[INDVARS_IV21]], 1
+; CHECK-NEXT:    [[EXITCOND23:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT22]], [[I2END]]
+; CHECK-NEXT:    br i1 [[EXITCOND23]], label [[FOR_END10]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.end10:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc8, %entry
+  %indvars.iv21 = phi i64 [ %i1Start, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr @a2, i64 0, i64 %indvars.iv21
+  %0 = trunc i64 %indvars.iv21 to i32
+  store i32 %0, ptr %arrayidx, align 4
+  %1 = trunc i64 %indvars.iv21 to i32
+  %add = add nsw i32 %1, %n
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body
+  %indvars.iv = phi i64 [ %i2Start, %for.body ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx7 = getelementptr inbounds [1024 x [1024 x i32]], ptr @a, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+  store i32 %add, ptr %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %i1End
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3
+  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+  %exitcond23 = icmp eq i64 %indvars.iv.next22, %i2End
+  br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
 !1 = distinct !{!1, !2, !3}
 !2 = !{!"llvm.loop.vectorize.width", i32 4}
 !3 = !{!"llvm.loop.vectorize.enable", i1 true}