Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1143,6 +1143,12 @@
   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
 }
 
+/// Return the runtime value for VF.
+static Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
+  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
+  return VF.isScalable() ? B.CreateVScale(EC) : EC;
+}
+
 namespace llvm {
 
 void reportVectorizationFailure(const StringRef DebugMsg,
@@ -2458,7 +2464,7 @@
   if (OrigLoop->isLoopInvariant(V))
     return V;
 
-  assert(Instance.Lane > 0
+  assert((Instance.Lane > 0 || Instance.isNonConstLane())
              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
              : true && "Uniform values only have lane zero");
 
@@ -2478,10 +2484,19 @@
     return U;
   }
 
+  Value *Lane;
+  if (Instance.isNonConstLane()) {
+    assert(Instance.Kind == VPIteration::LK_ScalableLast &&
+           "Cannot handle other non-constant lane types");
+    Lane = Builder.CreateSub(getRuntimeVF(Builder, Builder.getInt32Ty(), VF),
+                             Builder.getInt32(1 + Instance.Lane));
+  } else
+    Lane = Builder.getInt32(Instance.Lane);
+
   // Otherwise, the value from the original loop has been vectorized and is
   // represented by UF vector values. Extract and return the requested scalar
   // value from the appropriate vector lane.
-  return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
+  return Builder.CreateExtractElement(U, Lane);
 }
 
 void InnerLoopVectorizer::packScalarIntoVectorValue(
@@ -2492,6 +2507,7 @@
 
   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
+  assert(!Instance.isNonConstLane() && "Only constant lane indices supported");
   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
                                             Builder.getInt32(Instance.Lane));
   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
@@ -2883,7 +2899,8 @@
   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
   // the first lane and part.
   if (isa<NoAliasScopeDeclInst>(Instr))
-    if (Instance.Lane != 0 || Instance.Part != 0)
+    if (Instance.Lane != 0 || Instance.Part != 0 ||
+        Instance.Kind != VPIteration::LK_First)
       return;
 
   setDebugLocFromInst(Builder, Instr);
@@ -2901,8 +2918,10 @@
     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
     auto InputInstance = Instance;
     if (!Operand || !OrigLoop->contains(Operand) ||
-        (Cost->isUniformAfterVectorization(Operand, State.VF)))
+        (Cost->isUniformAfterVectorization(Operand, State.VF))) {
       InputInstance.Lane = 0;
+      InputInstance.Kind = VPIteration::LK_First;
+    }
     auto *NewOp = State.get(User.getOperand(op), InputInstance);
     Cloned->setOperand(op, NewOp);
   }
@@ -4395,19 +4414,24 @@
     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
     // Non-instruction incoming values will have only one value.
-    unsigned LastLane = 0;
-    if (isa<Instruction>(IncomingValue))
-      LastLane = Cost->isUniformAfterVectorization(
-                     cast<Instruction>(IncomingValue), VF)
-                     ? 0
-                     : VF.getKnownMinValue() - 1;
-    assert((!VF.isScalable() || LastLane == 0) &&
-           "scalable vectors dont support non-uniform scalars yet");
+
+    unsigned Lane = 0;
+    VPIteration::LaneKind Kind = VPIteration::LK_First;
+    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
+    if (isa<Instruction>(IncomingValue) &&
+        !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
+                                           VF)) {
+      if (VF.isScalable())
+        // In this case 'Lane' refers to the lane offset from the last lane.
+        Kind = VPIteration::LK_ScalableLast;
+      else
+        Lane = VF.getKnownMinValue() - 1;
+    }
+
     // Can be a loop invariant incoming value or the last scalar value to be
     // extracted from the vectorized loop.
-    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
     Value *lastIncomingValue =
-        getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
+        getOrCreateScalarValue(IncomingValue, {UF - 1, Lane, Kind});
     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
   }
 }
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -97,6 +97,24 @@
   /// in [0..VF)
   unsigned Lane;
+
+  /// There are cases where we wish to extract a particular lane of a vector,
+  /// but don't know the exact value at compile time. For example, extracting
+  /// the last lane of a scalable vector requires a runtime calculation. The
+  /// enum below describes how 'Lane' is interpreted. The default is LK_First,
+  /// which means 'Lane' is an offset from the start of the vector. To extract
+  /// a lane at an offset from the end, i.e. LastLane - Lane, set the lane
+  /// 'Kind' to LK_ScalableLast.
+  enum LaneKind {
+    LK_First,
+    LK_ScalableLast,
+  };
+
+  /// Indicates whether 'Lane' counts from the beginning or back from the end.
+  LaneKind Kind;
+
+  bool isNonConstLane() const { return Kind != LK_First; }
 };
 
 /// This is a helper struct for maintaining vectorization state. It's used for
@@ -158,6 +176,20 @@
     return ScalarMapStorage.count(Key);
   }
 
+  /// Returns the number of lanes that we are able to cache.
+  unsigned getNumCachedLanes() const { return 2 * VF.getKnownMinValue(); }
+
+  /// Maps the Lane in \p Instance to a cache index.
+  unsigned mapInstanceLaneToIndex(const VPIteration &Instance) const {
+    switch (Instance.Kind) {
+    case VPIteration::LK_ScalableLast:
+      return VF.getKnownMinValue() + Instance.Lane;
+    default:
+      assert(Instance.Kind == VPIteration::LK_First);
+      return Instance.Lane;
+    }
+  }
+
   /// \return True if the map has a scalar entry for \p Key and \p Instance.
   bool hasScalarValue(Value *Key, const VPIteration &Instance) const {
     assert(Instance.Part < UF && "Queried Scalar Part is too large.");
@@ -168,9 +200,10 @@
       return false;
     const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
     assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
-    assert(Entry[Instance.Part].size() == VF.getKnownMinValue() &&
+    assert(Entry[Instance.Part].size() == getNumCachedLanes() &&
           "ScalarParts has wrong dimensions.");
-    return Entry[Instance.Part][Instance.Lane] != nullptr;
+    unsigned CacheIdx = mapInstanceLaneToIndex(Instance);
+    return Entry[Instance.Part][CacheIdx] != nullptr;
   }
 
   /// Retrieve the existing vector value that corresponds to \p Key and
@@ -184,7 +217,8 @@
   /// \p Instance.
   Value *getScalarValue(Value *Key, const VPIteration &Instance) {
     assert(hasScalarValue(Key, Instance) && "Getting non-existent value.");
-    return ScalarMapStorage[Key][Instance.Part][Instance.Lane];
+    unsigned CacheIdx = mapInstanceLaneToIndex(Instance);
+    return ScalarMapStorage[Key][Instance.Part][CacheIdx];
   }
 
   /// Set a vector value associated with \p Key and \p Part. Assumes such a
@@ -207,10 +241,11 @@
       // TODO: Consider storing uniform values only per-part, as they occupy
       // lane 0 only, keeping the other VF-1 redundant entries null.
       for (unsigned Part = 0; Part < UF; ++Part)
-        Entry[Part].resize(VF.getKnownMinValue(), nullptr);
+        Entry[Part].resize(getNumCachedLanes(), nullptr);
       ScalarMapStorage[Key] = Entry;
     }
-    ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+    unsigned CacheIdx = mapInstanceLaneToIndex(Instance);
+    ScalarMapStorage[Key][Instance.Part][CacheIdx] = Scalar;
   }
 
   /// Reset the vector value associated with \p Key for the given \p Part.
@@ -230,7 +265,8 @@
                         Value *Scalar) {
     assert(hasScalarValue(Key, Instance) &&
            "Scalar value not set for part and lane");
-    ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+    unsigned CacheIdx = mapInstanceLaneToIndex(Instance);
+    ScalarMapStorage[Key][Instance.Part][CacheIdx] = Scalar;
   }
 };
 
@@ -300,6 +336,7 @@
     auto I = Data.PerPartScalars.find(Def);
     if (I == Data.PerPartScalars.end())
       return false;
+    assert(!Instance.isNonConstLane() && "Non-constant lanes unsupported");
    return Instance.Part < I->second.size() &&
           Instance.Lane < I->second[Instance.Part].size() &&
           I->second[Instance.Part][Instance.Lane];
@@ -321,6 +358,7 @@
     while (PerPartVec.size() <= Instance.Part)
       PerPartVec.emplace_back();
     auto &Scalars = PerPartVec[Instance.Part];
+    assert(!Instance.isNonConstLane() && "Non-constant lanes unsupported");
     while (Scalars.size() <= Instance.Lane)
       Scalars.push_back(nullptr);
     Scalars[Instance.Lane] = V;
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -220,6 +220,9 @@
   if (!Def->getDef() && OrigLoop->isLoopInvariant(Def->getLiveInIRValue()))
     return Def->getLiveInIRValue();
 
+  assert(!Instance.isNonConstLane() &&
+         "Non-constant lanes need handling with callbacks!");
+
   if (hasScalarValue(Def, Instance))
     return Data.PerPartScalars[Def][Instance.Part][Instance.Lane];
 
@@ -287,7 +290,8 @@
 void VPBasicBlock::execute(VPTransformState *State) {
   bool Replica = State->Instance &&
-                 !(State->Instance->Part == 0 && State->Instance->Lane == 0);
+                 !(State->Instance->Part == 0 && State->Instance->Lane == 0 &&
+                   State->Instance->Kind == VPIteration::LK_First);
   VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
   VPBlockBase *SingleHPred = nullptr;
   BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
@@ -401,7 +405,7 @@
     assert(!State->Instance && "Replicating a Region with non-null instance.");
 
     // Enter replicating mode.
-    State->Instance = {0, 0};
+    State->Instance = {0, 0, VPIteration::LK_First};
 
     for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
       State->Instance->Part = Part;
Index: llvm/test/Transforms/LoopVectorize/AArch64/neon-extract-last-veclane.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/neon-extract-last-veclane.ll
@@ -0,0 +1,73 @@
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+neon -S < %s 2>%t | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @inv_store_last_lane
+; CHECK: vector.body:
+; CHECK: store <4 x i32> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %{{.*}} = extractelement <4 x i32> %[[VEC_VAL]], i32 3
+
+entry:
+  %cmp12 = icmp sgt i64 %n, 0
+  br i1 %cmp12, label %for.body, label %for.cond.cleanup
+
+for.cond.for.cond.cleanup_crit_edge:              ; preds = %for.body
+  %mul.lcssa = phi i32 [ %mul, %for.body ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42
+  store i32 %mul.lcssa, i32* %arrayidx5, align 4
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = shl nsw i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.for.cond.cleanup_crit_edge, label %for.body, !llvm.loop !0
+}
+
+define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @ret_last_lane
+; CHECK: vector.body:
+; CHECK: store <4 x float> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %{{.*}} = extractelement <4 x float> %[[VEC_VAL]], i32 3
+
+entry:
+  %cmp12 = icmp sgt i64 %n, 0
+  br i1 %cmp12, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %inv.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %mul, %for.body ]
+  ret float %inv.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, 2.000000e+00
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !6
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon" }
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !2, !3, !4, !5}
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
@@ -0,0 +1,84 @@
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @inv_store_last_lane
+; CHECK: vector.body:
+; CHECK: store <vscale x 4 x i32> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
+; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x i32> %[[VEC_VAL]], i32 %[[LAST_LANE]]
+
+entry:
+  %cmp12 = icmp sgt i64 %n, 0
+  br i1 %cmp12, label %for.body, label %for.cond.cleanup
+
+for.cond.for.cond.cleanup_crit_edge:              ; preds = %for.body
+  %mul.lcssa = phi i32 [ %mul, %for.body ]
+  %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42
+  store i32 %mul.lcssa, i32* %arrayidx5, align 4
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = shl nsw i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.for.cond.cleanup_crit_edge, label %for.body, !llvm.loop !0
+}
+
+define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @ret_last_lane
+; CHECK: vector.body:
+; CHECK: store <vscale x 4 x float> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
+; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x float> %[[VEC_VAL]], i32 %[[LAST_LANE]]
+
+entry:
+  %cmp12 = icmp sgt i64 %n, 0
+  br i1 %cmp12, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %inv.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %mul, %for.body ]
+  ret float %inv.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, 2.000000e+00
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !6
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !2, !3, !4, !5}
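
Note: the snippet below is an illustrative, hand-written sketch and not part of the patch or its generated output. It shows, under the assumption of a scalable VF with a known minimum of 4 x i32, the shape of middle-block IR an LK_ScalableLast extract is intended to produce (getRuntimeVF multiplied by vscale, minus one, feeding extractelement); after instcombine the multiply folds to the shl matched by the SVE test above. The function and value names here are hypothetical.

; Sketch: extracting the last lane of a scalable vector at runtime.
declare i32 @llvm.vscale.i32()

define i32 @extract_last_lane_sketch(<vscale x 4 x i32> %vec) {
  %vscale = call i32 @llvm.vscale.i32()
  ; Runtime VF = vscale * known minimum VF (4 here), as computed by getRuntimeVF.
  %runtime.vf = mul i32 %vscale, 4
  ; Last lane index = runtime VF - (1 + Instance.Lane), with Instance.Lane == 0
  ; for a plain last-lane extract.
  %last.lane = sub i32 %runtime.vf, 1
  %val = extractelement <vscale x 4 x i32> %vec, i32 %last.lane
  ret i32 %val
}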