Index: llvm/lib/IR/IRBuilder.cpp
===================================================================
--- llvm/lib/IR/IRBuilder.cpp
+++ llvm/lib/IR/IRBuilder.cpp
@@ -528,14 +528,14 @@
 CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, Align Alignment,
                                             Value *Mask, Value *PassThru,
                                             const Twine &Name) {
-  auto *PtrsTy = cast<FixedVectorType>(Ptrs->getType());
+  auto *PtrsTy = cast<VectorType>(Ptrs->getType());
   auto *PtrTy = cast<PointerType>(PtrsTy->getElementType());
-  unsigned NumElts = PtrsTy->getNumElements();
-  auto *DataTy = FixedVectorType::get(PtrTy->getElementType(), NumElts);
+  ElementCount EC = PtrsTy->getElementCount();
+  auto *DataTy = VectorType::get(PtrTy->getElementType(), EC);
 
   if (!Mask)
     Mask = Constant::getAllOnesValue(
-        FixedVectorType::get(Type::getInt1Ty(Context), NumElts));
+        VectorType::get(Type::getInt1Ty(Context), EC));
 
   if (!PassThru)
     PassThru = UndefValue::get(DataTy);
@@ -558,20 +558,20 @@
 /// be accessed in memory
 CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs,
                                              Align Alignment, Value *Mask) {
-  auto *PtrsTy = cast<FixedVectorType>(Ptrs->getType());
-  auto *DataTy = cast<FixedVectorType>(Data->getType());
-  unsigned NumElts = PtrsTy->getNumElements();
+  auto *PtrsTy = cast<VectorType>(Ptrs->getType());
+  auto *DataTy = cast<VectorType>(Data->getType());
+  ElementCount EC = PtrsTy->getElementCount();
 
 #ifndef NDEBUG
   auto PtrTy = cast<PointerType>(PtrsTy->getElementType());
-  assert(NumElts == DataTy->getNumElements() &&
+  assert(EC == DataTy->getElementCount() &&
          PtrTy->getElementType() == DataTy->getElementType() &&
          "Incompatible pointer and data types");
 #endif
 
   if (!Mask)
     Mask = Constant::getAllOnesValue(
-        FixedVectorType::get(Type::getInt1Ty(Context), NumElts));
+        VectorType::get(Type::getInt1Ty(Context), EC));
 
   Type *OverloadedTypes[] = {DataTy, PtrsTy};
   Value *Ops[] = {Data, Ptrs, getInt32(Alignment.value()), Mask};
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6428,8 +6428,18 @@
         // relying on instcombine to remove them.
         // Load: Scalar load + broadcast
         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
-        unsigned Cost = getUniformMemOpCost(&I, VF);
-        setWideningDecision(&I, VF, CM_Scalarize, Cost);
+
+        // For scalable vectors, use gather/scatter instructions instead of a
+        // scalarized store for now, until we add a mechanism to extract the
+        // last lane of the vector. This is allowed, since llvm.masked.scatter
+        // is guaranteed to be an ordered store (from first to last element).
+        if (VF.isScalable())
+          setWideningDecision(&I, VF, CM_GatherScatter,
+                              /*Cost=*/VF.getKnownMinValue());
+        else {
+          unsigned Cost = getUniformMemOpCost(&I, VF);
+          setWideningDecision(&I, VF, CM_Scalarize, Cost);
+        }
         continue;
       }
 
@@ -6446,6 +6456,14 @@
         continue;
       }
 
+      // For scalable vectors, all accesses must be widened using gather/scatter
+      // or interleaved loads/stores. For now, default to gather/scatter.
+      if (VF.isScalable()) {
+        setWideningDecision(&I, VF, CM_GatherScatter,
+                            /*Cost=*/VF.getKnownMinValue());
+        continue;
+      }
+
       // Choose between Interleaving, Gather/Scatter or Scalarization.
       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
       unsigned NumAccesses = 1;
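For illustration (not part of the patch; every name below is ours): with ElementCount threaded through, CreateMaskedGather can now be handed a scalable vector of pointers, and the result type, all-true mask, and undef pass-through all fall out of the ElementCount rather than a fixed element count. A minimal standalone sketch against the CreateMaskedGather signature shown above:

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    Module M("gather-demo", Ctx);
    // A scalable vector of pointers: <vscale x 4 x double*>.
    auto *PtrTy = PointerType::getUnqual(Type::getDoubleTy(Ctx));
    auto *PtrsTy = ScalableVectorType::get(PtrTy, /*MinNumElts=*/4);
    auto *FnTy = FunctionType::get(Type::getVoidTy(Ctx), {PtrsTy}, false);
    auto *Fn = Function::Create(FnTy, Function::ExternalLinkage, "demo", M);
    IRBuilder<> B(BasicBlock::Create(Ctx, "entry", Fn));
    // Mask and PassThru are omitted: with the patch, the builder synthesizes
    // an all-true <vscale x 4 x i1> mask and an undef <vscale x 4 x double>
    // pass-through from the pointer vector's ElementCount.
    B.CreateMaskedGather(Fn->getArg(0), Align(8));
    B.CreateRetVoid();
    // Prints a call to @llvm.masked.gather.nxv4f64.nxv4p0f64.
    M.print(outs(), nullptr);
    return 0;
  }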
Index: llvm/test/Transforms/LoopVectorize/scalable-loop-invariant-store-unpredicated-body-scalar-tail.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/scalable-loop-invariant-store-unpredicated-body-scalar-tail.ll
@@ -0,0 +1,68 @@
+; For now this test requires aarch64-registered-target, until we can
+; also pass the loop hint as a 'force-vector-width' flag to opt.
+; REQUIRES: aarch64-registered-target
+
+; RUN: opt -S -loop-vectorize -instcombine < %s | FileCheck %s
+
+source_filename = "loop.c"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; This test has an invariant store to a[42], which is vectorized using a
+; masked.scatter operation.
+
+; CHECK: for.body.preheader:
+; CHECK-DAG: %wide.trip.count = zext i32 %N to i64
+; CHECK-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECK-DAG: %min.iters.check = icmp ugt i64 %[[VSCALEX4]], %wide.trip.count
+
+; CHECK: vector.ph:
+; CHECK-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECK-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
+; CHECK: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
+
+; CHECK: vector.body:
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %[[IDXB:.*]] = getelementptr inbounds double, double* %b, i64 %index
+; CHECK: %[[IDXB_CAST:.*]] = bitcast double* %[[IDXB]] to <vscale x 4 x double>*
+; CHECK: %wide.load = load <vscale x 4 x double>, <vscale x 4 x double>* %[[IDXB_CAST]], align 8, !alias.scope !0
+; CHECK: %[[FADD:.*]] = fadd <vscale x 4 x double> %wide.load, shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> undef, double 1.000000e+00, i32 0), <vscale x 4 x double> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECK: %[[IDXA:.*]] = getelementptr inbounds double, double* %a, i64 42
+; CHECK: %[[SPLATINSERT:.*]] = insertelement <vscale x 4 x double*> undef, double* %[[IDXA]], i32 0
+; CHECK: %[[SPLAT:.*]] = shufflevector <vscale x 4 x double*> %[[SPLATINSERT]], <vscale x 4 x double*> undef, <vscale x 4 x i32> zeroinitializer
+; CHECK: call void @llvm.masked.scatter.nxv4f64.nxv4p0f64(<vscale x 4 x double> %[[FADD]], <vscale x 4 x double*> %[[SPLAT]], i32 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> undef, i1 true, i32 0), <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer))
+; CHECK: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
+; CHECK: %index.next = add i64 %index, %[[VSCALEX4]]
+; CHECK: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
+; CHECK: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !5
+
+define void @loop(i32 %N, double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 42
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+}
+
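+; For reference, a plausible C source for @loop above (our reconstruction
+; from the IR, not taken from the original patch); the scalable
+; vectorize-width hint corresponds to the !llvm.loop metadata at the end
+; of the file:
+;
+;   void loop(int N, double *a, double *b) {
+;     for (int i = 0; i < N; i++)
+;       a[42] = b[i] + 1.0;
+;   }
+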
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", !4}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{i32 4, i1 1}
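The soundness argument in the LoopVectorize comment ("llvm.masked.scatter is guaranteed to be an ordered store, from first to last element") is what makes a scatter to a uniform address equivalent to the scalar loop: every lane writes &a[42], and the last lane's value is the one left in memory. A scalar reference model of that semantics (ours, for intuition only; fixed at four lanes for simplicity):

  #include <array>
  #include <cstddef>
  #include <cstdio>

  // Reference model of llvm.masked.scatter: active lanes are stored in
  // order, from the first element to the last.
  template <std::size_t N>
  void scatterModel(const std::array<double, N> &Data,
                    const std::array<double *, N> &Ptrs,
                    const std::array<bool, N> &Mask) {
    for (std::size_t I = 0; I < N; ++I)
      if (Mask[I])
        *Ptrs[I] = Data[I];
  }

  int main() {
    double A42 = 0.0; // stands in for a[42]
    std::array<double, 4> Data{1.0, 2.0, 3.0, 4.0};
    std::array<double *, 4> Ptrs{&A42, &A42, &A42, &A42}; // uniform address
    std::array<bool, 4> Mask{true, true, true, true};
    scatterModel(Data, Ptrs, Mask);
    std::printf("%f\n", A42); // prints 4.000000: the last lane wins
  }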