Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -497,6 +497,14 @@
                             const VPIteration &Instance, bool IfPredicateInstr,
                             VPTransformState &State);
 
+  /// Same as above, except that the lane comes from a runtime value, and the
+  /// cloned instruction is returned instead of being directly stored into
+  /// the transform state.
+  Instruction *
+  scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
+                       unsigned Part, Value *Lane, bool IfPredicateInstr,
+                       VPTransformState &State);
+
   /// Construct the vector value of a scalarized value \p V one lane at a time.
   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                  VPTransformState &State);
@@ -2779,6 +2787,70 @@
 
     PredicatedInstructions.push_back(Cloned);
 }
+
+Instruction *
+InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+                                          VPReplicateRecipe *RepRecipe,
+                                          unsigned Part, Value *Lane,
+                                          bool IfPredicateInstr,
+                                          VPTransformState &State) {
+  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+
+  // llvm.experimental.noalias.scope.decl intrinsics can just be dropped.
+  // TODO: add special first lane handling
+  if (isa<NoAliasScopeDeclInst>(Instr))
+    return nullptr;
+
+  // Does this instruction return a value?
+  bool IsVoidRetTy = Instr->getType()->isVoidTy();
+
+  Instruction *Cloned = Instr->clone();
+  if (!IsVoidRetTy)
+    Cloned->setName(Instr->getName() + ".cloned");
+
+  // If the scalarized instruction contributes to the address computation of a
+  // widen masked load/store which was in a basic block that needed predication
+  // and is not predicated after vectorization, we can't propagate
+  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
+  // instruction could feed a poison value to the base address of the widen
+  // load/store.
+  if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
+    Cloned->dropPoisonGeneratingFlags();
+
+  if (Instr->getDebugLoc())
+    State.setDebugLocFromInst(Instr);
+
+  // Replace the operands of the cloned instructions with their scalar
+  // equivalents in the new loop.
+  for (auto &I : enumerate(RepRecipe->operands())) {
+    VPValue *Operand = I.value();
+    if (VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand))
+      if (OperandR->isUniform()) {
+        VPIteration First = {Part, VPLane::getFirstLane()};
+        Cloned->setOperand(I.index(), State.get(Operand, First));
+        continue;
+      }
+    auto *VecPart = State.get(Operand, Part);
+    auto *Extract = Builder.CreateExtractElement(VecPart, Lane);
+    Cloned->setOperand(I.index(), Extract);
+  }
+  State.addNewMetadata(Cloned, Instr);
+
+  // Place the cloned scalar in the new loop.
+  State.Builder.Insert(Cloned);
+
+  // If we just cloned a new assumption, add it to the assumption cache.
+  if (auto *II = dyn_cast<AssumeInst>(Cloned))
+    AC->registerAssumption(II);
+
+  // End if-block.
+  if (IfPredicateInstr)
+    PredicatedInstructions.push_back(Cloned);
+
+  return Cloned;
+}
+
+
 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
   if (TripCount)
     return TripCount;
@@ -6791,6 +6863,13 @@
     // Scalarization of fixed length vectors "just works".
     return true;
 
+  // If not predicated, we can now scalarize generically with a loop
+  // if needed. The remainder of the code below is about checking
+  // for cases we can scalarize with predication without hitting
+  // the generic replicate path which isn't yet implemented.
+  if (!foldTailByMasking())
+    return true;
+
   // For scalable vectors, a uniform memop load is always
   // uniform-by-parts and we know how to scalarize that.
   if (isa<LoadInst>(I))
@@ -9597,6 +9676,67 @@
     return;
   }
 
+  if (State.VF.isScalable()) {
+    // For scalable vectors, we can scalarize by using an inner loop to
+    // execute the statically unknown number of iterations required.
+    // TODO: This strategy could be used for long fixed length vectors if
+    // profitable.
+    // TODO: Instead of one sub-loop per part, we could use one loop
+    // processing lanes of each unrolled copy at once.
+    auto *Instr = getUnderlyingInstr();
+
+    auto &Builder = State.Builder;
+    Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
+
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+
+      auto *InsertPt = &*Builder.GetInsertPoint();
+      BasicBlock *PreheaderBB = InsertPt->getParent();
+      BasicBlock *HeaderBB = SplitBlock(InsertPt->getParent(), InsertPt);
+      BasicBlock *ExitBB = SplitBlock(InsertPt->getParent(), InsertPt);
+
+      HeaderBB->getTerminator()->eraseFromParent();
+      Builder.SetInsertPoint(HeaderBB);
+      auto *IV = Builder.CreatePHI(RunTimeVF->getType(), 2);
+      IV->addIncoming(ConstantInt::get(RunTimeVF->getType(), 0), PreheaderBB);
+      PHINode *ResultIV = nullptr;
+      if (!Instr->getType()->isVoidTy()) {
+        auto *ResultTy = VectorType::get(Instr->getType(), State.VF);
+        ResultIV = Builder.CreatePHI(ResultTy, 2);
+        ResultIV->addIncoming(PoisonValue::get(ResultTy), PreheaderBB);
+      }
+
+      Instruction *Cloned =
+          State.ILV->scalarizeInstruction(Instr, this, Part, IV, IsPredicated,
+                                          State);
+
+      if (ResultIV) {
+        auto *Insert = Builder.CreateInsertElement(ResultIV, Cloned, IV);
+        ResultIV->addIncoming(Insert, HeaderBB);
+        State.set(this, Insert, Part);
+      }
+
+      auto *Inc = Builder.CreateAdd(IV, ConstantInt::get(RunTimeVF->getType(), 1));
+      IV->addIncoming(Inc, HeaderBB);
+      auto *Cmp = Builder.CreateICmpNE(IV, RunTimeVF);
+      Builder.CreateCondBr(Cmp, HeaderBB, ExitBB);
+
+      // Update the state so that we can continue at the right point for the next
+      // recipe, and have valid analysis results once done transforming.
+      assert(State.CFG.VPBB2IRBB[getParent()].back() == PreheaderBB);
+      State.CFG.VPBB2IRBB[getParent()].push_back(HeaderBB);
+      State.CFG.VPBB2IRBB[getParent()].push_back(ExitBB);
+      State.CFG.PrevBB = ExitBB;
+
+      assert(State.CurrentVectorLoop);
+      State.CurrentVectorLoop->addBasicBlockToLoop(HeaderBB, *State.LI);
+      State.CurrentVectorLoop->addBasicBlockToLoop(ExitBB, *State.LI);
+
+      Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+    }
+    return;
+  }
+
   // Generate scalar instances for all VF lanes of all UF parts.
   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
   const unsigned EndLane = State.VF.getKnownMinValue();
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -797,6 +797,7 @@
 void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
                                 BasicBlock *LoopLatchBB,
                                 BasicBlock *LoopExitBB) {
+  //LoopHeaderBB->getParent()->dump();
   // The vector body may be more than a single basic-block by this point.
   // Update the dominator tree information inside the vector body by propagating
   // it from header to latch, expecting only triangular control-flow, if any.
@@ -808,8 +809,13 @@
            "Basic block in vector loop has more than 2 successors.");
     PostDomSucc = Succs[0];
     if (Succs.size() == 1) {
-      assert(PostDomSucc->getSinglePredecessor() &&
-             "PostDom successor has more than one predecessor.");
+      DT->addNewBlock(PostDomSucc, BB);
+      continue;
+    }
+    if (Succs[0] == BB || Succs[1] == BB) {
+      // Simple one-block loop, i.e. scalable scalarization.
+      if (Succs[0] == BB)
+        PostDomSucc = Succs[1];
       DT->addNewBlock(PostDomSucc, BB);
       continue;
     }
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -19,18 +19,29 @@
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16*> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_SPLIT_SPLIT:%.*]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[SRC:%.*]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <vscale x 4 x i16>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP7]], align 2
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> [[WIDE_LOAD]], <vscale x 4 x i16*> [[BROADCAST_SPLAT]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i32 [[TMP8]], 4
+; CHECK-NEXT:    br label [[VECTOR_BODY_SPLIT:%.*]]
+; CHECK:       vector.body.split:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i32 [ 0, [[VECTOR_BODY]] ], [ [[TMP13:%.*]], [[VECTOR_BODY_SPLIT]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <vscale x 4 x i16*> [[BROADCAST_SPLAT]], i32 [[TMP10]]
+; CHECK-NEXT:    store i16 [[TMP11]], i16* [[TMP12]], align 2
+; CHECK-NEXT:    [[TMP13]] = add i32 [[TMP10]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[VECTOR_BODY_SPLIT]], label [[VECTOR_BODY_SPLIT_SPLIT]]
+; CHECK:       vector.body.split.split:
+; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_INC24:%.*]], label [[SCALAR_PH]]
Index: llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -853,15 +853,59 @@
 define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
 ; SCALABLE-LABEL: @uniform_store_of_loop_varying(
 ; SCALABLE-NEXT:  entry:
+; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x ptr> poison, ptr [[B:%.*]], i32 0
+; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 1 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 1 x ptr> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[V:%.*]], i32 0
+; SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SCALABLE:       vector.body:
+; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_SPLIT_SPLIT:%.*]] ]
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; SCALABLE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[INDEX]], i32 0
+; SCALABLE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[TMP3:%.*]] = add <vscale x 1 x i64> zeroinitializer, [[TMP2]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = mul <vscale x 1 x i64> [[TMP3]], shufflevector (<vscale x 1 x i64> insertelement (<vscale x 1 x i64> poison, i64 1, i32 0), <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer)
+; SCALABLE-NEXT:    [[TMP5:%.*]] = add <vscale x 1 x i64> [[DOTSPLAT]], [[TMP4]]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; SCALABLE-NEXT:    br label [[VECTOR_BODY_SPLIT:%.*]]
+; SCALABLE:       vector.body.split:
+; SCALABLE-NEXT:    [[TMP8:%.*]] = phi i32 [ 0, [[VECTOR_BODY]] ], [ [[TMP11:%.*]], [[VECTOR_BODY_SPLIT]] ]
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractelement <vscale x 1 x i64> [[TMP5]], i32 [[TMP8]]
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractelement <vscale x 1 x ptr> [[BROADCAST_SPLAT]], i32 [[TMP8]]
+; SCALABLE-NEXT:    store i64 [[TMP9]], ptr [[TMP10]], align 8
+; SCALABLE-NEXT:    [[TMP11]] = add i32 [[TMP8]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP8]], [[TMP7]]
+; SCALABLE-NEXT:    br i1 [[TMP12]], label [[VECTOR_BODY_SPLIT]], label [[VECTOR_BODY_SPLIT_SPLIT]]
+; SCALABLE:       vector.body.split.split:
+; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
+; SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
+; SCALABLE-NEXT:    store <vscale x 1 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP14]], align 8
+; SCALABLE-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; SCALABLE-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SCALABLE:       middle.block:
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SCALABLE:       scalar.ph:
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; SCALABLE-NEXT:    br label [[FOR_BODY:%.*]]
 ; SCALABLE:       for.body:
-; SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; SCALABLE-NEXT:    store i64 [[IV]], ptr [[B:%.*]], align 8
-; SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
-; SCALABLE-NEXT:    store i64 [[V:%.*]], ptr [[ARRAYIDX]], align 8
+; SCALABLE-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; SCALABLE-NEXT:    store i64 [[IV]], ptr [[B]], align 8
+; SCALABLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; SCALABLE:       for.end:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -1159,7 +1203,7 @@
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SCALABLE-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1173,7 +1217,7 @@
 ; SCALABLE-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
 ; SCALABLE-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; SCALABLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; SCALABLE-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; SCALABLE:       for.end:
 ; SCALABLE-NEXT:    ret void
 ;