Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1103,6 +1103,12 @@
   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
 }
 
+/// Return \p VF as a runtime value of type \p Ty (via vscale if scalable).
+Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
+  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
+  return VF.isScalable() ? B.CreateVScale(EC) : EC;
+}
+
 namespace llvm {
 
 void reportVectorizationFailure(const StringRef DebugMsg,
@@ -2357,7 +2363,8 @@
   Value *ScalarInst = State.get(Def, Instance);
   Value *VectorValue = State.get(Def, Instance.Part);
   VectorValue = Builder.CreateInsertElement(
-      VectorValue, ScalarInst, State.Builder.getInt32(Instance.Lane));
+      VectorValue, ScalarInst,
+      Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
   State.set(Def, VectorValue, Instance.Part);
 }
 
@@ -2769,7 +2776,7 @@
     auto InputInstance = Instance;
     if (!Operand || !OrigLoop->contains(Operand) ||
         (Cost->isUniformAfterVectorization(Operand, State.VF)))
-      InputInstance.Lane = 0;
+      InputInstance.Lane = VPLane::getFirstLane();
     auto *NewOp = State.get(User.getOperand(op), InputInstance);
     Cloned->setOperand(op, NewOp);
   }
@@ -4285,22 +4292,21 @@
 
     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
     // Non-instruction incoming values will have only one value.
-    unsigned LastLane = 0;
-    if (isa<Instruction>(IncomingValue))
-      LastLane = Cost->isUniformAfterVectorization(
-                     cast<Instruction>(IncomingValue), VF)
-                     ? 0
-                     : VF.getKnownMinValue() - 1;
-    assert((!VF.isScalable() || LastLane == 0) &&
-           "scalable vectors dont support non-uniform scalars yet");
+
+    VPLane Lane = VPLane::getFirstLane();
+    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
+    if (isa<Instruction>(IncomingValue) &&
+        !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
+                                           VF))
+      Lane = VPLane::getLastLaneForVF(VF);
+
     // Can be a loop invariant incoming value or the last scalar value to be
     // extracted from the vectorized loop.
-    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
     Value *lastIncomingValue =
         OrigLoop->isLoopInvariant(IncomingValue)
             ? IncomingValue
             : State.get(State.Plan->getVPValue(IncomingValue),
-                        VPIteration(UF - 1, LastLane));
+                        VPIteration(UF - 1, Lane));
     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
   }
 }
@@ -8981,7 +8987,7 @@
     // Insert scalar instance packing it into a vector.
     if (AlsoPack && State.VF.isVector()) {
       // If we're constructing lane 0, initialize to start from poison.
-      if (State.Instance->Lane == 0) {
+      if (State.Instance->Lane.isFirstLane()) {
         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
         Value *Poison = PoisonValue::get(
             VectorType::get(getUnderlyingValue()->getType(), State.VF));
@@ -9009,7 +9015,7 @@
   assert(State.Instance && "Branch on Mask works only on single instance.");
 
   unsigned Part = State.Instance->Part;
-  unsigned Lane = State.Instance->Lane;
+  unsigned Lane = State.Instance->Lane.getKnownLane();
 
   Value *ConditionBit = nullptr;
   VPValue *BlockInMask = getMask();
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -89,18 +89,104 @@
 /// vectorizer whereas the term "output IR" refers to code that is generated by
 /// the vectorizer.
 
+/// VPLane provides a way to access lanes in both fixed width and scalable
+/// vectors, where for the latter the lane index sometimes needs calculating
+/// as a runtime expression.
+class VPLane {
+public:
+  /// Kind describes how to interpret Lane.
+  enum class Kind : uint8_t {
+    /// For First, Lane is the index into the first N elements of a
+    /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
+    First,
+    /// For ScalableLast, Lane is the offset from the start of the last
+    /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
+    /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
+    /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
+    ScalableLast
+  };
+
+private:
+  /// In [0..VF.getKnownMinValue()); an index or offset depending on LaneKind.
+  unsigned Lane;
+
+  /// Indicates how the Lane should be interpreted, as described above.
+  Kind LaneKind;
+
+public:
+  VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}
+
+  static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
+
+  static VPLane getLastLaneForVF(const ElementCount &VF) {
+    unsigned LaneOffset = VF.getKnownMinValue() - 1;
+    Kind LaneKind;
+    if (VF.isScalable())
+      // In this case 'LaneOffset' refers to the offset from the start of the
+      // last subvector with VF.getKnownMinValue() elements.
+      LaneKind = VPLane::Kind::ScalableLast;
+    else
+      LaneKind = VPLane::Kind::First;
+    return VPLane(LaneOffset, LaneKind);
+  }
+
+  /// Returns a compile-time known value for the lane index and asserts if the
+  /// lane can only be calculated at runtime.
+  unsigned getKnownLane() const {
+    assert(LaneKind == Kind::First);
+    return Lane;
+  }
+
+  /// Returns an expression describing the lane index that can be used at
+  /// runtime.
+  Value *getAsRuntimeExpr(IRBuilder<> &Builder, const ElementCount &VF) const;
+
+  /// Returns the Kind of lane offset.
+  Kind getKind() const { return LaneKind; }
+
+  /// Sets the lane offset and lane kind.
+  void set(unsigned L, Kind K) {
+    Lane = L;
+    LaneKind = K;
+  }
+
+  /// Returns true if this is the first lane of the whole vector.
+  bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }
+
+  /// Maps the lane to a cache index; ScalableLast lanes follow the First ones.
+  unsigned mapToCacheIndex(const ElementCount &VF) const {
+    switch (LaneKind) {
+    case VPLane::Kind::ScalableLast:
+      assert(VF.isScalable() && Lane < VF.getKnownMinValue());
+      return VF.getKnownMinValue() + Lane;
+    default:
+      assert(Lane < VF.getKnownMinValue());
+      return Lane;
+    }
+  }
+
+  /// Returns the maximum number of lanes that we are able to consider
+  /// caching for \p VF.
+  static unsigned getNumCachedLanes(const ElementCount &VF) {
+    return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1);
+  }
+};
+
 /// VPIteration represents a single point in the iteration space of the output
 /// (vectorized and/or unrolled) IR loop.
 struct VPIteration {
   /// in [0..UF)
   unsigned Part;
 
-  /// in [0..VF)
-  unsigned Lane;
+  VPLane Lane;
+
+  VPIteration(unsigned Part, unsigned Lane,
+              VPLane::Kind Kind = VPLane::Kind::First)
+      : Part(Part), Lane(Lane, Kind) {}
 
-  VPIteration(unsigned Part, unsigned Lane) : Part(Part), Lane(Lane) {}
+  VPIteration(unsigned Part, const VPLane &Lane) : Part(Part), Lane(Lane) {}
 
-  bool isFirstIteration() const { return Part == 0 && Lane == 0; }
+  bool isFirstIteration() const { return Part == 0 && Lane.isFirstLane(); }
 };
 
 /// VPTransformState holds information passed down when "executing" a VPlan,
@@ -157,9 +243,10 @@
     auto I = Data.PerPartScalars.find(Def);
     if (I == Data.PerPartScalars.end())
       return false;
+    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
     return Instance.Part < I->second.size() &&
-           Instance.Lane < I->second[Instance.Part].size() &&
-           I->second[Instance.Part][Instance.Lane];
+           CacheIdx < I->second[Instance.Part].size() &&
+           I->second[Instance.Part][CacheIdx];
   }
 
   /// Set the generated Value for a given VPValue and a given Part.
@@ -185,10 +272,11 @@
     while (PerPartVec.size() <= Instance.Part)
       PerPartVec.emplace_back();
     auto &Scalars = PerPartVec[Instance.Part];
-    while (Scalars.size() <= Instance.Lane)
+    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
+    while (Scalars.size() <= CacheIdx)
       Scalars.push_back(nullptr);
-    assert(!Scalars[Instance.Lane] && "should overwrite existing value");
-    Scalars[Instance.Lane] = V;
+    assert(!Scalars[CacheIdx] && "should overwrite existing value");
+    Scalars[CacheIdx] = V;
   }
 
   /// Reset an existing scalar value for \p Def and a given \p Instance.
@@ -198,9 +286,10 @@
            "need to overwrite existing value");
     assert(Instance.Part < Iter->second.size() &&
            "need to overwrite existing value");
-    assert(Instance.Lane < Iter->second[Instance.Part].size() &&
+    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
+    assert(CacheIdx < Iter->second[Instance.Part].size() &&
            "need to overwrite existing value");
-    Iter->second[Instance.Part][Instance.Lane] = V;
+    Iter->second[Instance.Part][CacheIdx] = V;
   }
 
   /// Hold state information used when constructing the CFG of the output IR,
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -47,6 +47,7 @@
 
 using namespace llvm;
 extern cl::opt<bool> EnableVPlanNativePath;
+extern Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF);
 
 #define DEBUG_TYPE "vplan"
 
@@ -58,6 +59,19 @@
   return OS;
 }
 
+Value *VPLane::getAsRuntimeExpr(IRBuilder<> &Builder,
+                                const ElementCount &VF) const {
+  switch (LaneKind) {
+  case VPLane::Kind::ScalableLast:
+    // Index = RuntimeVF - (VF.getKnownMinValue() - Lane)
+    return Builder.CreateSub(getRuntimeVF(Builder, Builder.getInt32Ty(), VF),
+                             Builder.getInt32(VF.getKnownMinValue() - Lane));
+  case VPLane::Kind::First:
+    return Builder.getInt32(Lane);
+  }
+  llvm_unreachable("Unknown lane kind");
+}
+
 VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def)
     : SubclassID(SC), UnderlyingVal(UV), Def(Def) {
   if (Def)
@@ -244,18 +258,20 @@
   if (!Def->getDef())
     return Def->getLiveInIRValue();
 
-  if (hasScalarValue(Def, Instance))
-    return Data.PerPartScalars[Def][Instance.Part][Instance.Lane];
+  if (hasScalarValue(Def, Instance)) {
+    return Data
+        .PerPartScalars[Def][Instance.Part][Instance.Lane.mapToCacheIndex(VF)];
+  }
 
   assert(hasVectorValue(Def, Instance.Part));
   auto *VecPart = Data.PerPartOutput[Def][Instance.Part];
   if (!VecPart->getType()->isVectorTy()) {
-    assert(Instance.Lane == 0 && "cannot get lane > 0 for scalar");
+    assert(Instance.Lane.isFirstLane() && "cannot get lane > 0 for scalar");
     return VecPart;
   }
   // TODO: Cache created scalar values.
-  auto *Extract =
-      Builder.CreateExtractElement(VecPart, Builder.getInt32(Instance.Lane));
+  Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF);
+  auto *Extract = Builder.CreateExtractElement(VecPart, Lane);
   // set(Def, Extract, Instance);
   return Extract;
 }
@@ -427,7 +443,7 @@
     assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
     for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
          ++Lane) {
-      State->Instance->Lane = Lane;
+      State->Instance->Lane.set(Lane, VPLane::Kind::First);
       // Visit the VPBlocks connected to \p this, starting from it.
       for (VPBlockBase *Block : RPOT) {
         LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
@@ -0,0 +1,77 @@
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @inv_store_last_lane
+; CHECK: vector.body:
+; CHECK:  store <vscale x 4 x i32> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
+; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x i32> %[[VEC_VAL]], i32 %[[LAST_LANE]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = shl nsw i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0
+
+exit:              ; preds = %for.body
+  %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42
+  store i32 %mul, i32* %arrayidx5, align 4
+  ret void
+}
+
+define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @ret_last_lane
+; CHECK: vector.body:
+; CHECK:  store <vscale x 4 x float> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
+; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x float> %[[VEC_VAL]], i32 %[[LAST_LANE]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, 2.000000e+00
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit:                                 ; preds = %for.body
+  ret float %mul
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !2, !3, !4, !5}
Index: llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll
@@ -0,0 +1,61 @@
+; RUN: opt -loop-vectorize -dce -instcombine -S < %s 2>%t | FileCheck %s
+
+define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @inv_store_last_lane
+; CHECK: vector.body:
+; CHECK:  store <4 x i32> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK:  %{{.*}} = extractelement <4 x i32> %[[VEC_VAL]], i32 3
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = shl nsw i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0
+
+exit:              ; preds = %for.body
+  %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42
+  store i32 %mul, i32* %arrayidx5, align 4
+  ret void
+}
+
+define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @ret_last_lane
+; CHECK: vector.body:
+; CHECK:  store <4 x float> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK:  %{{.*}} = extractelement <4 x float> %[[VEC_VAL]], i32 3
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, 2.000000e+00
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit:                                 ; preds = %for.body
+  ret float %mul
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !2, !3, !4, !5}