Index: llvm/include/llvm/IR/IRBuilder.h
===================================================================
--- llvm/include/llvm/IR/IRBuilder.h
+++ llvm/include/llvm/IR/IRBuilder.h
@@ -861,6 +861,10 @@
                          Type *ResultType, const Twine &Name = "");
 
+  /// Create a call to llvm.vscale, multiplied by \p Scaling. The type of VScale
+  /// will be the same type as that of \p Scaling.
+  Value *CreateVScale(Constant *Scaling, const Twine &Name = "");
+
   /// Create a call to intrinsic \p ID with 1 operand which is mangled on its
   /// type.
   CallInst *CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V,
Index: llvm/lib/IR/IRBuilder.cpp
===================================================================
--- llvm/lib/IR/IRBuilder.cpp
+++ llvm/lib/IR/IRBuilder.cpp
@@ -80,6 +80,17 @@
   return CI;
 }
 
+Value *IRBuilderBase::CreateVScale(Constant *Scaling, const Twine &Name) {
+  Module *M = GetInsertBlock()->getParent()->getParent();
+  assert(isa<ConstantInt>(Scaling) && "Expected constant integer");
+  Function *TheFn =
+      Intrinsic::getDeclaration(M, Intrinsic::vscale, {Scaling->getType()});
+  CallInst *CI = createCallHelper(TheFn, {}, this, Name);
+  return cast<ConstantInt>(Scaling)->getSExtValue() == 1
+             ? CI
+             : CreateMul(CI, Scaling);
+}
+
 CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size,
                                       MaybeAlign Align, bool isVolatile,
                                       MDNode *TBAATag, MDNode *ScopeTag,
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -962,6 +962,15 @@
   return R;
 }
 
+/// Return a value for Step multiplied by VF.
+static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
+  assert(isa<ConstantInt>(Step) && "Expected an integer step");
+  Constant *StepVal = ConstantInt::get(
+      Step->getType(),
+      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
+  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
+}
+
 namespace llvm {
 
 void reportVectorizationFailure(const StringRef DebugMsg,
@@ -2101,8 +2110,6 @@
                                            const InductionDescriptor &ID) {
   // We shouldn't have to build scalar steps if we aren't vectorizing.
   assert(VF.isVector() && "VF should be greater than one");
-  assert(!VF.isScalable() &&
-         "the code below assumes a fixed number of elements at compile time");
   // Get the value type and ensure it and the step have the same integer type.
   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
   assert(ScalarIVTy == Step->getType() &&
@@ -2127,11 +2134,24 @@
       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
           ? 1
           : VF.getKnownMinValue();
+  assert((!VF.isScalable() || Lanes == 1) &&
+         "Should never scalarize a scalable vector");
   // Compute the scalar steps and save the results in VectorLoopValueMap.
   for (unsigned Part = 0; Part < UF; ++Part) {
     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
-      auto *StartIdx = getSignedIntOrFpConstant(
-          ScalarIVTy, VF.getKnownMinValue() * Part + Lane);
+      auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
+                                         ScalarIVTy->getScalarSizeInBits());
+      Value *StartIdx =
+          createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+      if (ScalarIVTy->isFloatingPointTy())
+        StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
+      StartIdx = addFastMathFlag(Builder.CreateBinOp(
+          AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane)));
+      // The step returned by `createStepForVF` is a runtime-evaluated value
+      // when VF is scalable. Otherwise, it should be folded into a Constant.
+      assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
+             "Expected StartIdx to be folded to a constant when VF is not "
+             "scalable");
       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
@@ -2174,10 +2194,11 @@
   // is known to be uniform after vectorization, this corresponds to lane zero
   // of the Part unroll iteration. Otherwise, the last instruction is the one
   // we created for the last vector lane of the Part unroll iteration.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
                           ? 0
                           : VF.getKnownMinValue() - 1;
+  assert((!VF.isScalable() || LastLane == 0) &&
+         "Scalable vectorization can't lead to any scalarized values.");
   auto *LastInst = cast<Instruction>(
       VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
 
@@ -2522,7 +2543,6 @@
 
   Type *ScalarDataTy = getMemInstValueType(Instr);
 
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   auto *DataTy = VectorType::get(ScalarDataTy, VF);
   const Align Alignment = getLoadStoreAlignment(Instr);
 
@@ -2555,6 +2575,9 @@
         InBounds = gep->isInBounds();
 
       if (Reverse) {
+        assert(!VF.isScalable() &&
+               "Reversing vectors is not yet supported for scalable vectors.");
+
         // If the address is consecutive but reversed, then the
         // wide store needs to start at the last vector element.
         PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
@@ -2566,8 +2589,9 @@
       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
     } else {
-      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
+      Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
+      PartPtr = cast<GetElementPtrInst>(
+          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
       PartPtr->setIsInBounds(InBounds);
     }
 
@@ -2764,8 +2788,7 @@
   Type *Ty = TC->getType();
 
   // This is where we can make the step a runtime constant.
-  assert(!VF.isScalable() && "scalable vectorization is not supported yet");
-  Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
+  Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
 
   // If the tail is to be folded by masking, round the number of iterations N
   // up to a multiple of Step instead of rounding down. This is done by first
@@ -2776,6 +2799,8 @@
   if (Cost->foldTailByMasking()) {
     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
            "VF*UF must be a power of 2 when folding tail by masking");
+    assert(!VF.isScalable() &&
+           "Tail folding not yet supported for scalable vectors");
     TC = Builder.CreateAdd(
         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
   }
@@ -2854,11 +2879,9 @@
   // If tail is to be folded, vector loop takes care of all iterations.
   Value *CheckMinIters = Builder.getFalse();
   if (!Cost->foldTailByMasking()) {
-    assert(!VF.isScalable() && "scalable vectors not yet supported.");
-    CheckMinIters = Builder.CreateICmp(
-        P, Count,
-        ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
-        "min.iters.check");
+    Value *Step =
+        createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
+    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
   }
   // Create new preheader for vector loop.
   LoopVectorPreHeader =
@@ -3314,8 +3337,8 @@
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   // The loop step is equal to the vectorization factor (num of SIMD elements)
   // times the unroll factor (num of SIMD instructions).
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
-  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
+  Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
   Induction =
       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
@@ -4161,7 +4184,6 @@
 }
 
 void InnerLoopVectorizer::fixLCSSAPHIs() {
-  assert(!VF.isScalable() && "the code below assumes fixed width vectors");
   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
     if (LCSSAPhi.getNumIncomingValues() == 1) {
       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
@@ -4172,6 +4194,8 @@
                               cast<Instruction>(IncomingValue), VF)
                               ? 0
                               : VF.getKnownMinValue() - 1;
+      assert((!VF.isScalable() || LastLane == 0) &&
+             "scalable vectors dont support non-uniform scalars yet");
       // Can be a loop invariant incoming value or the last scalar value to be
       // extracted from the vectorized loop.
       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
@@ -5395,6 +5419,9 @@
 
 VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
+  // FIXME: This can be fixed for scalable vectors later, because at this stage
+  // the LoopVectorizer will only consider vectorizing a loop with scalable
+  // vectors when the loop has a hint to enable vectorization for a given VF.
   assert(!MaxVF.isScalable() && "scalable vectors not yet supported");
 
   float Cost = expectedCost(ElementCount::getFixed(1)).first;
@@ -5586,7 +5613,6 @@
   }
 
   // Clamp the interleave ranges to reasonable counts.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   unsigned MaxInterleaveCount =
       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
 
@@ -5601,6 +5627,13 @@
   // If trip count is known or estimated compile time constant, limit the
   // interleave count to be less than the trip count divided by VF.
+  //
+  // For scalable vectors we can't know if interleaving is beneficial. It may
+  // not be beneficial for small loops if none of the lanes in the second vector
+  // iteration is enabled. However, for larger loops, there is likely to be a
+  // similar benefit as for fixed-width vectors. For now, we choose to leave
+  // the InterleaveCount as if vscale is '1', although if some information about
+  // the vector is known (e.g. min vector size), we can make a better decision.
   if (BestKnownTC) {
     MaxInterleaveCount =
         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
@@ -5636,7 +5669,7 @@
   // potentially expose ILP opportunities.
   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                     << "LV: IC is " << IC << '\n'
-                    << "LV: VF is " << VF.getKnownMinValue() << '\n');
+                    << "LV: VF is " << VF << '\n');
   const bool AggressivelyInterleaveReductions =
       TTI.enableAggressiveInterleaving(HasReductions);
   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
@@ -6307,8 +6340,6 @@
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                ElementCount VF) {
-  assert(!VF.isScalable() &&
-         "the cost model is not yet implemented for scalable vectorization");
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
   if (isUniformAfterVectorization(I, VF))
@@ -6372,7 +6403,6 @@
 }
 
 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   if (VF.isScalar())
     return;
   NumPredStores = 0;
@@ -6961,7 +6991,6 @@
 
 Optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
-  assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
   if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
@@ -6996,6 +7025,8 @@
 
   ElementCount MaxVF = MaybeMaxVF.getValue();
   assert(MaxVF.isNonZero() && "MaxVF is zero.");
+  assert(!MaxVF.isScalable() &&
+         "Scalable vectors not yet supported beyond this point");
 
   for (ElementCount VF = ElementCount::getFixed(1);
        ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
@@ -8060,6 +8091,7 @@
 
 void VPReplicateRecipe::execute(VPTransformState &State) {
   if (State.Instance) { // Generate a single instance.
+    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
     State.ILV->scalarizeInstruction(Ingredient, *this, *State.Instance,
                                     IsPredicated, State);
     // Insert scalar instance packing it into a vector.
@@ -8080,6 +8112,8 @@
   // instruction is uniform inwhich case generate only the first lane for each
   // of the UF parts.
   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
+  assert((!State.VF.isScalable() || IsUniform) &&
+         "Can't scalarize a scalable vector");
   for (unsigned Part = 0; Part < State.UF; ++Part)
     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Ingredient, *this, {Part, Lane},
@@ -8223,12 +8257,6 @@
   // Get user vectorization factor.
   ElementCount UserVF = Hints.getWidth();
-  if (UserVF.isScalable()) {
-    // TODO: Use scalable UserVF once we've added initial support for scalable
-    // vectorization. For now we convert it to fixed width, but this will be
-    // removed in a later patch.
-    UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
-  }
 
   // Plan how to best vectorize, return the best VF and its cost.
   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
@@ -8394,13 +8422,6 @@
   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
-  if (UserVF.isScalable()) {
-    // TODO: Use scalable UserVF once we've added initial support for scalable
-    // vectorization. For now we convert it to fixed width, but this will be
-    // removed in a later patch.
-    UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
-  }
-
   unsigned UserIC = Hints.getInterleave();
 
   // Plan how to best vectorize, return the best VF and its cost.
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -163,7 +163,6 @@
     assert(Instance.Part < UF && "Queried Scalar Part is too large.");
     assert(Instance.Lane < VF.getKnownMinValue() &&
            "Queried Scalar Lane is too large.");
-    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
 
     if (!hasAnyScalarValue(Key))
       return false;
Index: llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
@@ -0,0 +1,101 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=1 < %s | FileCheck %s --check-prefix=CHECKUF1
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-interleave=2 < %s | FileCheck %s --check-prefix=CHECKUF2
+
+; CHECKUF1: for.body.preheader:
+; CHECKUF1-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
+
+; CHECKUF1: vector.ph:
+; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
+; CHECKUF1: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECKUF1: vector.body:
+; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECKUF1: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECKUF1: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECKUF1: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF1: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
+; CHECKUF1: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
+; CHECKUF1: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECKUF1: %index.next = add i64 %index, %[[VSCALEX4]]
+; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+
+; For an interleave factor of 2, vscale is scaled by 8 instead of 4 (and thus shifted left by 3 instead of 2).
+; There is also the increment for the next iteration, e.g. instead of indexing IDXB, it indexes at IDXB + vscale * 4.
+
+; CHECKUF2: for.body.preheader:
+; CHECKUF2-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX8]], %wide.trip.count
+
+; CHECKUF2: vector.ph:
+; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]
+; CHECKUF2: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECKUF2: vector.body:
+; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECKUF2: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECKUF2: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
+; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXB]], i64 %[[VSCALE2_EXT]]
+; CHECKUF2: %[[IDXB_NEXT_CAST:.*]] = bitcast double* %[[IDXB_NEXT]] to <vscale x 4 x double>*
+; CHECKUF2: %wide.load{{[0-9]+}} = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_NEXT_CAST]], align 8, !alias.scope !0
+; CHECKUF2: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF2: %[[FADD_NEXT:.*]] = fadd <vscale x 4 x double> %wide.load{{[0-9]+}}, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
+; CHECKUF2: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
+; CHECKUF2: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECKUF2: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECKUF2: %[[VSCALE2_EXT:.*]] = sext i32 %[[VSCALE2]] to i64
+; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, double* %[[IDXA]], i64 %[[VSCALE2_EXT]]
+; CHECKUF2: %[[IDXA_NEXT_CAST:.*]] = bitcast double* %[[IDXA_NEXT]] to <vscale x 4 x double>*
+; CHECKUF2: store <vscale x 4 x double> %[[FADD_NEXT]], <vscale x 4 x double>* %[[IDXA_NEXT_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
+; CHECKUF2: %index.next = add i64 %index, %[[VSCALEX8]]
+; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                 ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                   ; preds = %for.body, %entry
+  ret void
+
+for.body:                           ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+}
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.width", !3}
+!3 = !{i32 4, i1 true}
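
; A minimal illustrative sketch (not part of the patch itself): the hint
; metadata above, !{i32 4, i1 true}, encodes a requested width of 4 with the
; scalable flag set, i.e. a VF of <vscale x 4 x double> for this loop. Before
; instcombine folds the multiply into the `shl` form matched by the CHECK
; lines, the step produced by createStepForVF/CreateVScale for UF=1 is expected
; to look roughly like the following (register names here are hypothetical):
;
;   %vscale     = call i64 @llvm.vscale.i64()   ; runtime vscale factor
;   %step       = mul i64 %vscale, 4            ; known minimum VF (4) * UF (1)
;   %index.next = add i64 %index, %step         ; advance canonical induction variable
;
; With -force-vector-interleave=2 the scaling constant becomes 8 (VF * UF),
; which matches the `shl i64 %{{.*}}, 3` checked under CHECKUF2.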