Index: llvm/include/llvm/IR/IRBuilder.h
===================================================================
--- llvm/include/llvm/IR/IRBuilder.h
+++ llvm/include/llvm/IR/IRBuilder.h
@@ -861,6 +861,10 @@
                                  Type *ResultType,
                                  const Twine &Name = "");

+  /// Create a call to llvm.vscale, multiplied by \p Scaling. The type of VScale
+  /// will be the same type as that of \p Scaling.
+  Value *CreateVScale(Constant *Scaling, const Twine &Name = "");
+
   /// Create a call to intrinsic \p ID with 1 operand which is mangled on its
   /// type.
   CallInst *CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V,
Index: llvm/lib/IR/IRBuilder.cpp
===================================================================
--- llvm/lib/IR/IRBuilder.cpp
+++ llvm/lib/IR/IRBuilder.cpp
@@ -80,6 +80,17 @@
   return CI;
 }

+Value *IRBuilderBase::CreateVScale(Constant *Scaling, const Twine &Name) {
+  Module *M = GetInsertBlock()->getParent()->getParent();
+  assert(isa<ConstantInt>(Scaling) && "Expected constant integer");
+  Function *TheFn =
+      Intrinsic::getDeclaration(M, Intrinsic::vscale, {Scaling->getType()});
+  CallInst *CI = createCallHelper(TheFn, {}, this, Name);
+  return cast<ConstantInt>(Scaling)->getSExtValue() == 1
+             ? CI
+             : CreateMul(CI, Scaling);
+}
+
 CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size,
                                       MaybeAlign Align, bool isVolatile,
                                       MDNode *TBAATag, MDNode *ScopeTag,
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -348,7 +348,6 @@
 /// type is irregular if its allocated size doesn't equal the store size of an
 /// element of the corresponding vector type at the given vectorization factor.
 static bool hasIrregularType(Type *Ty, const DataLayout &DL, ElementCount VF) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   // Determine if an array of VF elements of type Ty is "bitcast compatible"
   // with a vector.
   if (VF.isVector()) {
@@ -963,6 +962,15 @@
   return R;
 }

+/// Return a value for Step multiplied by VF.
+static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
+  assert(isa<ConstantInt>(Step) && "Expected an integer step");
+  Constant *StepVal = ConstantInt::get(
+      Step->getType(),
+      cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
+  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
+}
+
 namespace llvm {

 void reportVectorizationFailure(const StringRef DebugMsg,
@@ -1225,9 +1233,7 @@
   /// width \p VF. Return CM_Unknown if this instruction did not pass
   /// through the cost modeling.
   InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
-    assert(!VF.isScalable() && "scalable vectors not yet supported.");
-    assert(VF.isVector() && "Expected VF >=2");
-
+    assert(VF.isVector() && "Expected VF to be a vector VF");
     // Cost model is not run in the VPlan-native path - return conservative
     // result until this changes.
     if (EnableVPlanNativePath)
@@ -2104,8 +2110,6 @@
                                            const InductionDescriptor &ID) {
   // We shouldn't have to build scalar steps if we aren't vectorizing.
   assert(VF.isVector() && "VF should be greater than one");
-  assert(!VF.isScalable() &&
-         "the code below assumes a fixed number of elements at compile time");
   // Get the value type and ensure it and the step have the same integer type.
   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
   assert(ScalarIVTy == Step->getType() &&
@@ -2130,6 +2134,8 @@
       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ?
           1 : VF.getKnownMinValue();
+  assert((!VF.isScalable() || Lanes == 1) &&
+         "Should never scalarize a scalable vector");
   // Compute the scalar steps and save the results in VectorLoopValueMap.
   for (unsigned Part = 0; Part < UF; ++Part) {
     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
@@ -2177,10 +2183,11 @@
   // is known to be uniform after vectorization, this corresponds to lane zero
   // of the Part unroll iteration. Otherwise, the last instruction is the one
   // we created for the last vector lane of the Part unroll iteration.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
                           ? 0
                           : VF.getKnownMinValue() - 1;
+  assert((!VF.isScalable() || LastLane == 0) &&
+         "Scalable vectorization can't lead to any scalarized values.");
   auto *LastInst = cast<Instruction>(
       VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
@@ -2525,7 +2532,6 @@
   Type *ScalarDataTy = getMemInstValueType(Instr);

-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   auto *DataTy = VectorType::get(ScalarDataTy, VF);
   const Align Alignment = getLoadStoreAlignment(Instr);
@@ -2558,19 +2564,25 @@
       InBounds = gep->isInBounds();

     if (Reverse) {
+      Value *Increment = createStepForVF(Builder, Builder.getInt32(-Part), VF);
+
       // If the address is consecutive but reversed, then the
       // wide store needs to start at the last vector element.
       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
+          ScalarDataTy, Ptr, Increment));
       PartPtr->setIsInBounds(InBounds);
+      Value *Offset =
+          Builder.CreateSub(Builder.getInt32(1),
+                            createStepForVF(Builder, Builder.getInt32(1), VF));
       PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
+          ScalarDataTy, PartPtr, Offset));
       PartPtr->setIsInBounds(InBounds);
       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
     } else {
-      PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
-          ScalarDataTy, Ptr, Builder.getInt32(Part * VF.getKnownMinValue())));
+      Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
+      PartPtr = cast<GetElementPtrInst>(
+          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
       PartPtr->setIsInBounds(InBounds);
     }
@@ -2766,10 +2778,8 @@
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   Type *Ty = TC->getType();
-  // This is where we can make the step a runtime constant.
-  assert(!VF.isScalable() && "scalable vectorization is not supported yet");
-  Constant *Step = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
+  Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);

   // If the tail is to be folded by masking, round the number of iterations N
   // up to a multiple of Step instead of rounding down. This is done by first
@@ -2780,6 +2790,8 @@
   if (Cost->foldTailByMasking()) {
     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
            "VF*UF must be a power of 2 when folding tail by masking");
+    assert(!VF.isScalable() &&
+           "Tail folding not yet supported for scalable vectors");
     TC = Builder.CreateAdd(
         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
   }
@@ -2858,11 +2870,9 @@
   // If tail is to be folded, vector loop takes care of all iterations.
   Value *CheckMinIters = Builder.getFalse();
   if (!Cost->foldTailByMasking()) {
-    assert(!VF.isScalable() && "scalable vectors not yet supported.");
-    CheckMinIters = Builder.CreateICmp(
-        P, Count,
-        ConstantInt::get(Count->getType(), VF.getKnownMinValue() * UF),
-        "min.iters.check");
+    Value *Step =
+        createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
+    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
   }
   // Create new preheader for vector loop.
   LoopVectorPreHeader =
@@ -3318,8 +3328,8 @@
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   // The loop step is equal to the vectorization factor (num of SIMD elements)
   // times the unroll factor (num of SIMD instructions).
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
-  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
+  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
+  Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
   Induction = createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
@@ -3699,8 +3709,8 @@
   // profile is not inherently precise anyway. Note also possible bypass of
   // vector code caused by legality checks is ignored, assigning all the weight
   // to the vector loop, optimistically.
-  assert(!VF.isScalable() &&
-         "cannot use scalable ElementCount to determine unroll factor");
+
+  // TODO: Consider changing this calculation for scalable vectors.
   setProfileInfoAfterUnrolling(
       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
@@ -4163,7 +4173,6 @@
 void InnerLoopVectorizer::fixLCSSAPHIs() {
-  assert(!VF.isScalable() && "the code below assumes fixed width vectors");
   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
     if (LCSSAPhi.getNumIncomingValues() == 1) {
       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
@@ -4174,6 +4183,8 @@
                                cast<Instruction>(IncomingValue), VF)
                                ? 0
                                : VF.getKnownMinValue() - 1;
+      assert((!VF.isScalable() || LastLane == 0) &&
+             "scalable vectors dont support non-uniform scalars yet");
       // Can be a loop invariant incoming value or the last scalar value to be
       // extracted from the vectorized loop.
       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
@@ -4505,7 +4516,6 @@
 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
                                            VPTransformState &State) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   switch (I.getOpcode()) {
   case Instruction::Call:
   case Instruction::Br:
@@ -4894,7 +4904,6 @@
 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
                                                          ElementCount VF) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   if (!blockNeedsPredication(I->getParent()))
     return false;
   switch(I->getOpcode()) {
@@ -5592,7 +5601,6 @@
   }

   // Clamp the interleave ranges to reasonable counts.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   unsigned MaxInterleaveCount =
       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
@@ -5794,7 +5802,9 @@
     if (Ty->isTokenTy())
       return 0U;
     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
-    assert(!VF.isScalable() && "scalable vectors not yet supported.");
+    // This assert can be removed, because the answer probably wouldn't be
+    // any different than for fixed-width vectors.
+    // assert(!VF.isScalable() && "scalable vectors not yet supported.");
     return std::max<unsigned>(1, VF.getKnownMinValue() * TypeSize / WidestRegister);
   };
@@ -6072,7 +6082,6 @@
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   VectorizationCostTy Cost;

   // For each block.
@@ -6319,13 +6328,22 @@
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                ElementCount VF) {
-  assert(!VF.isScalable() &&
-         "the cost model is not yet implemented for scalable vectorization");
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
   if (isUniformAfterVectorization(I, VF))
     VF = ElementCount::getFixed(1);

+  // FIXME: Implement a proper cost-model for scalable vectors.
+  // For now disable the LoopVectorizationCostModel for scalable vectors,
+  // because there are too many code-paths that have either:
+  // `assert(!VF.isScalable())` or `cast<FixedVectorType>(..)`.
+  // After we fix up those code-paths, we can remove this shortcut.
+  // Because the only way to vectorize a loop using scalable vectors is by
+  // forcing it through the LoopHints, the cost-model for scalable vectors
+  // is not yet relevant anyway.
+  if (VF.isScalable())
+    return {1, true};
+
   if (VF.isVector() && isProfitableToScalarize(I, VF))
     return VectorizationCostTy(InstsToScalarize[VF][I], false);
@@ -6384,7 +6402,6 @@
 }

 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   if (VF.isScalar())
     return;
   NumPredStores = 0;
@@ -6973,7 +6990,6 @@
 Optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
-  assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
@@ -8077,6 +8093,7 @@
 void VPReplicateRecipe::execute(VPTransformState &State) {
   if (State.Instance) { // Generate a single instance.
+    assert(!State.VF.isScalable() && "Can't scalarise a scalable vector");
     State.ILV->scalarizeInstruction(Ingredient, *this, *State.Instance,
                                     IsPredicated, State);
     // Insert scalar instance packing it into a vector.
@@ -8097,6 +8114,8 @@
   // instruction is uniform inwhich case generate only the first lane for each
   // of the UF parts.
   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
+  assert((!State.VF.isScalable() || IsUniform) &&
+         "Can't scalarise a scalable vector");
   for (unsigned Part = 0; Part < State.UF; ++Part)
     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
       State.ILV->scalarizeInstruction(Ingredient, *this, {Part, Lane},
Index: llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
@@ -0,0 +1,65 @@
+; For now this test requires aarch64-registered-target, until we can
+; also pass the loop hint as a 'force-vector-width' flag to opt.
+; REQUIRES: aarch64-registered-target
+
+; RUN: opt -S -loop-vectorize -instcombine < %s | FileCheck %s
+
+source_filename = "loop.c"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK: for.body.preheader:
+; CHECK-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECK-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECK-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
+
+; CHECK: vector.ph:
+; CHECK-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECK-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
+; CHECK: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECK: vector.body:
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECK: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECK: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECK: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 %index
+; CHECK: %[[IDXA_CAST:.*]] = bitcast double* %[[IDXA]] to <vscale x 4 x double>*
+; CHECK: store <vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %[[IDXA_CAST]], align 8, !alias.scope !3, !noalias !0
+; CHECK: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECK: %index.next = add i64 %index, %[[VSCALEX4]]
+; CHECK: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECK: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                       ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                         ; preds = %for.body, %entry
+  ret void
+
+for.body:                                 ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", !4}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{i32 4, i1 1}
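Note (reviewer aid, not part of the patch): the snippet below is a minimal sketch of how the new IRBuilderBase::CreateVScale hook composes with the createStepForVF pattern introduced above, i.e. materialising a step of `vscale * (MinVF * UF)` for scalable VFs and a plain constant otherwise. The helper name emitRuntimeVF and the module/function boilerplate are illustrative assumptions only, not code from this patch.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/TypeSize.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical helper mirroring createStepForVF: a compile-time constant for
// fixed VFs, and `vscale * (MinVF * UF)` for scalable VFs.
static Value *emitRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
                            unsigned UF) {
  Constant *MinStep = ConstantInt::get(Ty, VF.getKnownMinValue() * UF);
  // CreateVScale emits the llvm.vscale intrinsic and, per the patch, only
  // emits the extra multiply when the scaling factor is not 1.
  return VF.isScalable() ? B.CreateVScale(MinStep) : MinStep;
}

int main() {
  LLVMContext Ctx;
  Module M("vscale-demo", Ctx);
  IRBuilder<> B(Ctx);

  // An empty i64-returning function to hold the generated step computation.
  auto *FnTy = FunctionType::get(B.getInt64Ty(), /*isVarArg=*/false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "step", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  B.SetInsertPoint(BB);

  // <vscale x 4 x ...> elements with an unroll factor of 2 -> vscale * 8.
  ElementCount VF = ElementCount::getScalable(4);
  Value *Step = emitRuntimeVF(B, B.getInt64Ty(), VF, /*UF=*/2);
  B.CreateRet(Step);

  M.print(outs(), nullptr);
  return 0;
}

With a fixed VF (e.g. ElementCount::getFixed(4)) the same call simply returns an i64 constant, which is why the patch can route both the fixed-width and scalable cases through createStepForVF without special-casing the callers.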