diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2247,6 +2247,23 @@
     collectSupportedLoops(*InnerL, LI, ORE, V);
 }
 
+/// Report successful vectorization of \p TheLoop, both as a debug message and
+/// as an optimization remark.
+static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
+                                VectorizationFactor VF, unsigned IC) {
+  LLVM_DEBUG(debugVectorizationMessage(
+      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
+      nullptr));
+  StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
+  ORE->emit([&]() {
+    return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
+                              TheLoop->getHeader())
+           << "vectorized " << LoopType << "loop (vectorization width: "
+           << ore::NV("VectorizationFactor", VF.Width)
+           << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
+  });
+}
+
 //===----------------------------------------------------------------------===//
 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
 // LoopVectorizationCostModel and LoopVectorizationPlanner.
@@ -7510,8 +7527,22 @@
 VectorizationFactor
 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
-  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
   ElementCount VF = UserVF;
+  if (UserVF.isScalable() && !TTI->supportsScalableVectors() &&
+      !ForceTargetSupportsScalableVectors) {
+    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: User VF=" << VF
+                      << " is scalable, but the target does not support "
+                         "scalable vectors.\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+                                        OrigLoop->getStartLoc(),
+                                        OrigLoop->getHeader())
+             << "Cannot vectorize outer loop with user-specified vectorization "
+             << "factor " << ore::NV("UserVectorizationFactor", VF)
+             << " as the target does not support scalable vectors.";
+    });
+    return VectorizationFactor::Disabled();
+  }
   // Outer loop handling: They may require CFG and instruction level
   // transformations before even evaluating whether vectorization is profitable.
   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
   // the vectorization pipeline.
@@ -9922,6 +9953,8 @@
     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
   }
 
+  reportVectorization(ORE, L, VF, 1);
+
   // Mark the loop as already vectorized to avoid vectorizing again.
   Hints.setAlreadyVectorized();
   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
@@ -10473,13 +10506,7 @@
       DisableRuntimeUnroll = true;
     }
     // Report the vectorization decision.
-    ORE->emit([&]() {
-      return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
-                                L->getHeader())
-             << "vectorized loop (vectorization width: "
-             << NV("VectorizationFactor", VF.Width)
-             << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
-    });
+    reportVectorization(ORE, L, VF, IC);
   }
 
   if (ORE->allowExtraAnalysis(LV_NAME))
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -120,7 +120,10 @@
   // Get or create a region for the loop containing BB.
   Loop *CurrentLoop = LI->getLoopFor(BB);
   VPRegionBlock *ParentR = nullptr;
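+  // Create a region only for loops at or below the depth of TheLoop, i.e. for
+  // TheLoop itself and the loops nested inside it; loops enclosing TheLoop
+  // (relevant when TheLoop is not outermost) do not get a region of their own.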
-  if (CurrentLoop) {
+  if (CurrentLoop && CurrentLoop->getLoopDepth() >= TheLoop->getLoopDepth()) {
     auto Iter = Loop2Region.insert({CurrentLoop, nullptr});
     if (Iter.second)
       Iter.first->second = new VPRegionBlock(
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/outer_loop_scalable.ll b/llvm/test/Transforms/LoopVectorize/RISCV/outer_loop_scalable.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/outer_loop_scalable.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v -passes=loop-vectorize -enable-vplan-native-path %s -pass-remarks=loop-vectorize -S 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARK
+
+; void test(int n, int **a)
+; {
+;   #pragma clang loop vectorize_width(4, scalable)
+;   for (int i = 0; i < n; ++i) {
+;     for (int j = 0; j < n; ++j) {
+;       a[i][j] = 2;
+;     }
+;   }
+; }
+
+; CHECK-REMARK: remark: <unknown>:0:0: vectorized outer loop (vectorization width: vscale x 4, interleaved count: 1)
+
+define dso_local void @test(i32 %0, ptr %1) {
+; CHECK-LABEL: define dso_local void @test
+; CHECK-SAME: (i32 [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP33:%.*]]
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i64> [[TMP11]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 1, [[TMP14]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP15]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[TMP22:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[TMP22]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x ptr> @llvm.masked.gather.nxv4p0.nxv4p0(<vscale x 4 x ptr> [[TMP16]], i32 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x ptr> poison)
+; CHECK-NEXT:    br label [[TMP17:%.*]]
+; CHECK:       17:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP19:%.*]], [[TMP17]] ]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, <vscale x 4 x ptr> [[WIDE_MASKED_GATHER]], <vscale x 4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x ptr> [[TMP18]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT:    [[TMP19]] = add nuw nsw <vscale x 4 x i64> [[VEC_PHI]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq <vscale x 4 x i64> [[TMP19]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 4 x i1> [[TMP20]], i32 0
+; CHECK-NEXT:    br i1 [[TMP21]], label [[TMP22]], label [[TMP17]]
+; CHECK:       22:
+; CHECK-NEXT:    [[TMP23:%.*]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq <vscale x 4 x i64> [[TMP23]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP26:%.*]] = mul i64 [[TMP25]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP5]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[TMP32:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TMP4]] ]
+; CHECK-NEXT:    br label [[TMP28:%.*]]
+; CHECK:       28:
+; CHECK-NEXT:    [[TMP29:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP35:%.*]], [[TMP34:%.*]] ]
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load ptr, ptr [[TMP30]], align 8
+; CHECK-NEXT:    br label [[TMP37:%.*]]
+; CHECK:       32:
+; CHECK-NEXT:    br label [[TMP33]]
+; CHECK:       33:
+; CHECK-NEXT:    ret void
+; CHECK:       34:
+; CHECK-NEXT:    [[TMP35]] = add nuw nsw i64 [[TMP29]], 1
+; CHECK-NEXT:    [[TMP36:%.*]] = icmp eq i64 [[TMP35]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP36]], label [[TMP32]], label [[TMP28]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       37:
+; CHECK-NEXT:    [[TMP38:%.*]] = phi i64 [ 0, [[TMP28]] ], [ [[TMP40:%.*]], [[TMP37]] ]
+; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[TMP31]], i64 [[TMP38]]
+; CHECK-NEXT:    store i32 2, ptr [[TMP39]], align 4
+; CHECK-NEXT:    [[TMP40]] = add nuw nsw i64 [[TMP38]], 1
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[TMP40]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP41]], label [[TMP34]], label [[TMP37]]
+;
+  %3 = icmp sgt i32 %0, 0
+  br i1 %3, label %4, label %11
+
+4:
+  %5 = zext i32 %0 to i64
+  br label %6
+
+6:
+  %7 = phi i64 [ 0, %4 ], [ %13, %12 ]
+  %8 = getelementptr inbounds ptr, ptr %1, i64 %7
+  %9 = load ptr, ptr %8, align 8
+  br label %15
+
+10:
+  br label %11
+
+11:
+  ret void
+
+12:
+  %13 = add nuw nsw i64 %7, 1
+  %14 = icmp eq i64 %13, %5
+  br i1 %14, label %10, label %6, !llvm.loop !9
+
+15:
+  %16 = phi i64 [ 0, %6 ], [ %18, %15 ]
+  %17 = getelementptr inbounds i32, ptr %9, i64 %16
+  store i32 2, ptr %17, align 4
+  %18 = add nuw nsw i64 %16, 1
+  %19 = icmp eq i64 %18, %5
+  br i1 %19, label %12, label %15
+}
+
+!9 = distinct !{!9, !10, !11, !12, !13}
+!10 = !{!"llvm.loop.mustprogress"}
+!11 = !{!"llvm.loop.vectorize.width", i32 4}
+!12 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_no_scalable.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_no_scalable.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_no_scalable.ll
@@ -0,0 +1,53 @@
+; RUN: opt -mtriple=x86_64 -passes=loop-vectorize -enable-vplan-native-path %s -pass-remarks-analysis=loop-vectorize -disable-output 2>&1 | FileCheck %s
+
+; CHECK: Cannot vectorize outer loop with user-specified vectorization factor vscale x 4 as the target does not support scalable vectors
+
+; void test(int n, int **a)
+; {
+;   #pragma clang loop vectorize_width(4, scalable)
+;   for (int i = 0; i < n; ++i) {
+;     for (int j = 0; j < n; ++j) {
+;       a[i][j] = 2;
+;     }
+;   }
+; }
+
+define dso_local void @test(i32 %0, ptr %1) {
+  %3 = icmp sgt i32 %0, 0
+  br i1 %3, label %4, label %11
+
+4:
+  %5 = zext i32 %0 to i64
+  br label %6
+
+6:
+  %7 = phi i64 [ 0, %4 ], [ %13, %12 ]
+  %8 = getelementptr inbounds ptr, ptr %1, i64 %7
+  %9 = load ptr, ptr %8, align 8
+  br label %15
+
+10:
+  br label %11
+
+11:
+  ret void
+
+12:
+  %13 = add nuw nsw i64 %7, 1
+  %14 = icmp eq i64 %13, %5
+  br i1 %14, label %10, label %6, !llvm.loop !9
+
+15:
+  %16 = phi i64 [ 0, %6 ], [ %18, %15 ]
+  %17 = getelementptr inbounds i32, ptr %9, i64 %16
+  store i32 2, ptr %17, align 4
+  %18 = add nuw nsw i64 %16, 1
+  %19 = icmp eq i64 %18, %5
+  br i1 %19, label %12, label %15
+}
+
+!9 = distinct !{!9, !10, !11, !12, !13}
+!10 = !{!"llvm.loop.mustprogress"}
+!11 = !{!"llvm.loop.vectorize.width", i32 4}
+!12 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test3.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test3.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test3.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -passes=loop-vectorize -enable-vplan-native-path -pass-remarks=loop-vectorize < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARK
+
+; void test(int n, int **a)
+; {
+;   for (int k = 0; k < n; ++k) {
+;     a[k][0] = 0;
+;     #pragma clang loop vectorize_width(4)
+;     for (int i = 0; i < n; ++i) {
+;       for (int j = 0; j < n; ++j) {
+;         a[i][j] = 2 + k;
+;       }
+;     }
+;   }
+; }
+
+; CHECK-REMARK: remark: <unknown>:0:0: vectorized outer loop (vectorization width: 4, interleaved count: 1)
+
+define void @test(i32 %0, ptr %1) {
+; CHECK-LABEL: define void @test
+; CHECK-SAME: (i32 [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP7:%.*]]
+; CHECK:       4:
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT:    br label [[TMP8:%.*]]
+; CHECK:       6:
+; CHECK-NEXT:    br label [[TMP7]]
+; CHECK:       7:
+; CHECK-NEXT:    ret void
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi i64 [ 0, [[TMP4]] ], [ [[TMP29:%.*]], [[TMP28:%.*]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
+; CHECK-NEXT:    store i32 0, ptr [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP9]] to i32
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP5]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP13]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[TMP20:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[TMP20]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> [[TMP14]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x ptr> poison)
+; CHECK-NEXT:    br label [[TMP15:%.*]]
+; CHECK:       15:
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP17:%.*]], [[TMP15]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, <4 x ptr> [[WIDE_MASKED_GATHER]], <4 x i64> [[VEC_PHI]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[BROADCAST_SPLAT]], <4 x ptr> [[TMP16]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP17]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq <4 x i64> [[TMP17]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP18]], i32 0
+; CHECK-NEXT:    br i1 [[TMP19]], label [[TMP20]], label [[TMP15]]
+; CHECK:       20:
+; CHECK-NEXT:    [[TMP21:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq <4 x i64> [[TMP21]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP5]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[TMP28]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TMP8]] ]
+; CHECK-NEXT:    br label [[TMP24:%.*]]
+; CHECK:       24:
+; CHECK-NEXT:    [[TMP25:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP32:%.*]], [[TMP31:%.*]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds ptr, ptr [[TMP1]], i64 [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[TMP26]], align 8
+; CHECK-NEXT:    br label [[TMP34:%.*]]
+; CHECK:       28:
+; CHECK-NEXT:    [[TMP29]] = add nuw nsw i64 [[TMP9]], 1
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[TMP29]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP30]], label [[TMP6:%.*]], label [[TMP8]]
+; CHECK:       31:
+; CHECK-NEXT:    [[TMP32]] = add nuw nsw i64 [[TMP25]], 1
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp eq i64 [[TMP32]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP33]], label [[TMP28]], label [[TMP24]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       34:
+; CHECK-NEXT:    [[TMP35:%.*]] = phi i64 [ 0, [[TMP24]] ], [ [[TMP37:%.*]], [[TMP34]] ]
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i64 [[TMP35]]
+; CHECK-NEXT:    store i32 [[TMP13]], ptr [[TMP36]], align 4
+; CHECK-NEXT:    [[TMP37]] = add nuw nsw i64 [[TMP35]], 1
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp eq i64 [[TMP37]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP38]], label [[TMP31]], label [[TMP34]]
+;
+  %3 = icmp sgt i32 %0, 0
+  br i1 %3, label %4, label %7
+
+4:
+  %5 = zext i32 %0 to i64
+  br label %8
+
+6:
+  br label %7
+
+7:
+  ret void
+
+8:
+  %9 = phi i64 [ 0, %4 ], [ %19, %18 ]
+  %10 = getelementptr inbounds ptr, ptr %1, i64 %9
+  %11 = load ptr, ptr %10, align 8
+  store i32 0, ptr %11, align 4
+  %12 = trunc i64 %9 to i32
+  %13 = add i32 %12, 2
+  br label %14
+
+14:
+  %15 = phi i64 [ 0, %8 ], [ %22, %21 ]
+  %16 = getelementptr inbounds ptr, ptr %1, i64 %15
+  %17 = load ptr, ptr %16, align 8
+  br label %24
+
+18:
+  %19 = add nuw nsw i64 %9, 1
+  %20 = icmp eq i64 %19, %5
+  br i1 %20, label %6, label %8
+
+21:
+  %22 = add nuw nsw i64 %15, 1
+  %23 = icmp eq i64 %22, %5
+  br i1 %23, label %18, label %14, !llvm.loop !13
+
+24:
+  %25 = phi i64 [ 0, %14 ], [ %27, %24 ]
+  %26 = getelementptr inbounds i32, ptr %17, i64 %25
+  store i32 %13, ptr %26, align 4
+  %27 = add nuw nsw i64 %25, 1
+  %28 = icmp eq i64 %27, %5
+  br i1 %28, label %21, label %24
+}
+
+!13 = distinct !{!13, !14, !15, !16}
+!14 = !{!"llvm.loop.vectorize.width", i32 4}
+!15 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!16 = !{!"llvm.loop.vectorize.enable", i1 true}