Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -316,7 +316,7 @@
 // executing the inner loop will execute the same iterations). This check is
 // very constrained for now but it will be relaxed in the future. \p Lp is
 // considered uniform if it meets all the following conditions:
-// 1) it has a canonical IV (starting from 0 and with stride 1),
+// 1) it has a stride-one IV,
 // 2) its latch terminator is a conditional branch and,
 // 3) its latch condition is a compare instruction whose operands are the
 //    canonical IV and an OuterLp invariant.
@@ -334,7 +334,7 @@
 // before introducing the aforementioned infrastructure. However, if this is not
 // the case, we should move the \p OuterLp independent checks to a separate
 // function that is only executed once for each \p Lp.
-static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
+static bool isUniformLoop(Loop *Lp, Loop *OuterLp, ScalarEvolution *SE) {
   assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
 
   // If Lp is the outer loop, it's uniform by definition.
@@ -343,9 +343,21 @@
   assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
 
   // 1.
-  PHINode *IV = Lp->getCanonicalInductionVariable();
+  PHINode *IV = Lp->getInductionVariable(*SE);
   if (!IV) {
-    LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
+    LLVM_DEBUG(dbgs() << "LV: IV not found.\n");
+    return false;
+  }
+  const SCEV *IVSCEV = SE->getSCEV(IV);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IVSCEV);
+  if (!AR) {
+    LLVM_DEBUG(dbgs() << "LV: Bad IV - Not an AddRecExpr SCEV " << *IVSCEV
+                      << "\n");
+    return false;
+  }
+  const SCEV *Step = AR->getStepRecurrence(*SE);
+  if (!Step->isOne()) {
+    LLVM_DEBUG(dbgs() << "LV: Not stride-one IV.\n");
     return false;
   }
 
@@ -379,13 +391,13 @@
 
 // Return true if \p Lp and all its nested loops are uniform with regard to \p
 // OuterLp.
-static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
-  if (!isUniformLoop(Lp, OuterLp))
+static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp, ScalarEvolution *SE) {
+  if (!isUniformLoop(Lp, OuterLp, SE))
     return false;
 
   // Check if nested loops are uniform.
   for (Loop *SubLp : *Lp)
-    if (!isUniformLoopNest(SubLp, OuterLp))
+    if (!isUniformLoopNest(SubLp, OuterLp, SE))
       return false;
 
   return true;
@@ -529,8 +541,8 @@
 
   // Check whether inner loops are uniform. At this point, we only support
   // simple outer loops scenarios with uniform nested loops.
-  if (!isUniformLoopNest(TheLoop /*loop nest*/,
-                         TheLoop /*context outer loop*/)) {
+  if (!isUniformLoopNest(TheLoop /*loop nest*/, TheLoop /*context outer loop*/,
+                         PSE.getSE())) {
     reportVectorizationFailure("Outer loop contains divergent loops",
         "loop control flow is not understood by vectorizer",
         "CFGNotUnderstood", ORE, TheLoop);
Index: llvm/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll
+++ llvm/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll
@@ -20,7 +20,7 @@
 
 ; Case 1 (for (j = i; j < M; j++)): Inner loop with divergent IV start.
 ; CHECK-LABEL: iv_start
-; CHECK: LV: Not vectorizing: Outer loop contains divergent loops.
+; CHECK: LV: Not vectorizing: Unsupported conditional branch.
 ; CHECK: LV: Not vectorizing: Unsupported outer loop.
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
Index: llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
+++ llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
@@ -1,3 +1,8 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
 ; extern int arr[8][8];
 ; extern int arr2[8];
 ;
@@ -12,42 +17,71 @@
 ;       arr[i2][i1] = i1 + n;
 ;   }
 ; }
-;
-; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
-; CHECK-LABEL: vector.ph:
-; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> poison, i32 %n, i64 0
-; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> poison, <4 x i32> zeroinitializer
-
-; CHECK-LABEL: vector.body:
-; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
-; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
-; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> %[[VecInd]]
-; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
-; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[VecIndTr]], <4 x ptr> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
-; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]]
-; CHECK: br label %[[InnerLoop:.+]]
-
-; CHECK: [[InnerLoop]]:
-; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ zeroinitializer, %vector.body ], [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ]
-; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
-; CHECK: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %[[StoreVal]], <4 x ptr> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8>
-; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0
-; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
-
-; CHECK: [[ForInc]]:
-; CHECK: %[[IndNext]] = add nuw i64 %[[Ind]], 4
-; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
-; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
 
 @arr2 = external global [8 x i32], align 16
 @arr = external global [8 x [8 x i32]], align 16
 
-; Function Attrs: norecurse nounwind uwtable
 define void @foo(i32 %n) {
+; CHECK-LABEL: define void @foo
+; CHECK-SAME: (i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_INC82:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_INC82]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32>
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP1]], <4 x ptr> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    br label [[FOR_BODY31:%.*]]
+; CHECK:       for.body31:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP5:%.*]], [[FOR_BODY31]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, <4 x i64> [[VEC_PHI]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP3]], <4 x ptr> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP5]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[FOR_INC82]], label [[FOR_BODY31]]
+; CHECK:       for.inc82:
+; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 8, 8
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END10:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT22:%.*]], [[FOR_INC8:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x i32], ptr @arr2, i64 0, i64 [[INDVARS_IV21]]
+; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[INDVARS_IV21]] to i32
+; CHECK-NEXT:    store i32 [[TMP11]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDVARS_IV21]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP12]], [[N]]
+; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
+; CHECK:       for.body3:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [8 x [8 x i32]], ptr @arr, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV21]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_INC8]], label [[FOR_BODY3]]
+; CHECK:       for.inc8:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT22]] = add nuw nsw i64 [[INDVARS_IV21]], 1
+; CHECK-NEXT:    [[EXITCOND23:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT22]], 8
+; CHECK-NEXT:    br i1 [[EXITCOND23]], label [[FOR_END10]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.end10:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -77,6 +111,129 @@
   ret void
 }
 
+; typedef unsigned long long i64;
+; extern int a[1024][1024];
+; extern int a2[1024];
+;
+; void foo(int n, i64 i1Start, i64 i1End, i64 i2Start, i64 i2End)
+; {
+;   int i1, i2;
+;
+; #pragma clang loop vectorize(enable) vectorize_width(4)
+;   for (i1 = i1Start; i1 < i1End; i1++) {
+;     a2[i1] = i1;
+;     for (i2 = i2Start; i2 < i2End; i2++)
+;       a[i2][i1] = i1 + n;
+;   }
+; }
+
+@a2 = external global [1024 x i32], align 16
+@a = external global [1024 x [1024 x i32]], align 16
+
+define void @foo2(i32 %n, i64 %i1Start, i64 %i1End, i64 %i2Start, i64 %i2End) {
+; CHECK-LABEL: define void @foo2
+; CHECK-SAME: (i32 [[N:%.*]], i64 [[I1START:%.*]], i64 [[I1END:%.*]], i64 [[I2START:%.*]], i64 [[I2END:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[I2END]], [[I1START]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 [[I1START]], [[N_VEC]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[I1START]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[N]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i64> poison, i64 [[I2START]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT2]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i64> poison, i64 [[I1END]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT4]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i64> poison, i64 [[I2END]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT7]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_INC86:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_INC86]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], ptr @a2, i64 0, <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32>
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP2]], <4 x ptr> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    br label [[FOR_BODY31:%.*]]
+; CHECK:       for.body31:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ [[BROADCAST_SPLAT3]], [[VECTOR_BODY]] ], [ [[TMP6:%.*]], [[FOR_BODY31]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1024 x [1024 x i32]], ptr @a, i64 0, <4 x i64> [[VEC_PHI]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP4]], <4 x ptr> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP6]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP6]], [[BROADCAST_SPLAT5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[FOR_INC86]], label [[FOR_BODY31]]
+; CHECK:       for.inc86:
+; CHECK-NEXT:    [[TMP9:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq <4 x i64> [[TMP9]], [[BROADCAST_SPLAT8]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END10:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I1START]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT22:%.*]], [[FOR_INC8:%.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr @a2, i64 0, i64 [[INDVARS_IV21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[INDVARS_IV21]] to i32
+; CHECK-NEXT:    store i32 [[TMP12]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc i64 [[INDVARS_IV21]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP13]], [[N]]
+; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
+; CHECK:       for.body3:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[I2START]], [[FOR_BODY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds [1024 x [1024 x i32]], ptr @a, i64 0, i64 [[INDVARS_IV]], i64 [[INDVARS_IV21]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[I1END]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_INC8]], label [[FOR_BODY3]]
+; CHECK:       for.inc8:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT22]] = add nuw nsw i64 [[INDVARS_IV21]], 1
+; CHECK-NEXT:    [[EXITCOND23:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT22]], [[I2END]]
+; CHECK-NEXT:    br i1 [[EXITCOND23]], label [[FOR_END10]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       for.end10:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc8, %entry
+  %indvars.iv21 = phi i64 [ %i1Start, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr @a2, i64 0, i64 %indvars.iv21
+  %0 = trunc i64 %indvars.iv21 to i32
+  store i32 %0, ptr %arrayidx, align 4
+  %1 = trunc i64 %indvars.iv21 to i32
+  %add = add nsw i32 %1, %n
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body
+  %indvars.iv = phi i64 [ %i2Start, %for.body ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx7 = getelementptr inbounds [1024 x [1024 x i32]], ptr @a, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+  store i32 %add, ptr %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %i1End
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3
+  %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+  %exitcond23 = icmp eq i64 %indvars.iv.next22, %i2End
+  br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10:                                        ; preds = %for.inc8
+  ret void
+}
+
 !1 = distinct !{!1, !2, !3}
 !2 = !{!"llvm.loop.vectorize.width", i32 4}
 !3 = !{!"llvm.loop.vectorize.enable", i1 true}