diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14792,8 +14792,8 @@
   return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
 }
 
-static inline int numVectorEltsOrZero(EVT T) {
-  return T.isVector() ? T.getVectorNumElements() : 0;
+static inline ElementCount numVectorEltsOrZero(EVT T) {
+  return T.isVector() ? T.getVectorElementCount() : ElementCount::getFixed(0);
 }
 
 bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
@@ -14861,6 +14861,24 @@
   EVT STMemType = ST->getMemoryVT();
   EVT STType = ST->getValue().getValueType();
 
+  // There are two cases to consider here:
+  //  1. The store is fixed width and the load is scalable. In this case we
+  //     don't know at compile time if the store completely envelops the load
+  //     so we abandon the optimisation.
+  //  2. The store is scalable and the load is fixed width. We could
+  //     potentially support a limited number of cases here, but there has been
+  //     no cost-benefit analysis to prove it's worth it.
+  bool LdStScalable = LDMemType.isScalableVector();
+  if (LdStScalable != STMemType.isScalableVector())
+    return SDValue();
+
+  // If we are dealing with scalable vectors on a big endian platform the
+  // calculation of offsets below becomes trickier, since we do not know at
+  // compile time the absolute size of the vector. Until we've done more
+  // analysis on big-endian platforms it seems better to bail out for now.
+  if (LdStScalable && DAG.getDataLayout().isBigEndian())
+    return SDValue();
+
   BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
   BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
   int64_t Offset;
@@ -14872,13 +14890,21 @@
   // the stored value). With Offset=n (for n > 0) the loaded value starts at the
   // n:th least significant byte of the stored value.
   if (DAG.getDataLayout().isBigEndian())
-    Offset = ((int64_t)STMemType.getStoreSizeInBits() -
-              (int64_t)LDMemType.getStoreSizeInBits()) / 8 - Offset;
+    Offset = ((int64_t)STMemType.getStoreSizeInBits().getFixedSize() -
+              (int64_t)LDMemType.getStoreSizeInBits().getFixedSize()) /
+                 8 -
+             Offset;
 
   // Check that the stored value cover all bits that are loaded.
-  bool STCoversLD =
-      (Offset >= 0) &&
-      (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());
+  bool STCoversLD;
+
+  TypeSize LdMemSize = LDMemType.getSizeInBits();
+  TypeSize StMemSize = STMemType.getSizeInBits();
+  if (LdStScalable)
+    STCoversLD = (Offset == 0) && LdMemSize == StMemSize;
+  else
+    STCoversLD = (Offset >= 0) && (Offset * 8 + LdMemSize.getFixedSize() <=
+                                       StMemSize.getFixedSize());
 
   auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
     if (LD->isIndexed()) {
@@ -14899,15 +14925,15 @@
   // Memory as copy space (potentially masked).
   if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
     // Simple case: Direct non-truncating forwarding
-    if (LDType.getSizeInBits() == LDMemType.getSizeInBits())
+    if (LDType.getSizeInBits() == LdMemSize)
       return ReplaceLd(LD, ST->getValue(), Chain);
     // Can we model the truncate and extension with an and mask?
     if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
         !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
       // Mask to size of LDMemType
       auto Mask =
-          DAG.getConstant(APInt::getLowBitsSet(STType.getSizeInBits(),
-                                               STMemType.getSizeInBits()),
+          DAG.getConstant(APInt::getLowBitsSet(STType.getFixedSizeInBits(),
+                                               StMemSize.getFixedSize()),
                           SDLoc(ST), STType);
       auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
       return ReplaceLd(LD, Val, Chain);
diff --git a/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define <vscale x 2 x i64> @sti64ldi64(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti64ldi64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  %0 = load <vscale x 2 x i64>, <vscale x 2 x i64>* %arrayidx1
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x double> @stf64ldf64(<vscale x 2 x double>* nocapture %P, <vscale x 2 x double> %v) {
+; CHECK-LABEL: stf64ldf64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
+  store <vscale x 2 x double> %v, <vscale x 2 x double>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
+  %0 = load <vscale x 2 x double>, <vscale x 2 x double>* %arrayidx1
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x i64> @sti32ldi32ext(<vscale x 2 x i32>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti32ldi32ext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtw z1.d, p0/m, z0.d
+; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %0 = trunc <vscale x 2 x i64> %v to <vscale x 2 x i32>
+  store <vscale x 2 x i32> %0, <vscale x 2 x i32>* %P
+  %1 = load <vscale x 2 x i32>, <vscale x 2 x i32>* %P
+  %2 = sext <vscale x 2 x i32> %1 to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %2
+}
+
+define <2 x i64> @sti64ldfixedi64(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti64ldfixedi64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
+  %arrayidx1 = bitcast <vscale x 2 x i64>* %arrayidx0 to <2 x i64>*
+  %0 = load <2 x i64>, <2 x i64>* %arrayidx1
+  ret <2 x i64> %0
+}
+
+define <vscale x 4 x i32> @sti64ldi32(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti64ldi32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %0 = bitcast <vscale x 2 x i64>* %P to <vscale x 4 x i32>*
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 4 x i32>, <vscale x 4 x i32>* %0, i64 1
+  %1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %arrayidx1
+  ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 2 x i64> @stf64ldi64(<vscale x 2 x double>* nocapture %P, <vscale x 2 x double> %v) {
+; CHECK-LABEL: stf64ldi64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %0 = bitcast <vscale x 2 x double>* %P to <vscale x 2 x i64>*
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
+  store <vscale x 2 x double> %v, <vscale x 2 x double>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %0, i64 1
+  %1 = load <vscale x 2 x i64>, <vscale x 2 x i64>* %arrayidx1
+  ret <vscale x 2 x i64> %1
+}
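
For reference, a minimal standalone sketch of the "store covers load" check that the DAGCombiner change introduces. This is illustrative only and not part of the patch: the helper name storeCoversLoad is hypothetical, and it simply restates the patch's logic on top of LLVM's TypeSize API from llvm/Support/TypeSize.h.

#include "llvm/Support/TypeSize.h"
#include <cstdint>

using namespace llvm;

// Scalable sizes are only known as a multiple of vscale at compile time, so
// anything other than an exact, zero-offset size match is rejected outright;
// for fixed-width types the loaded bits must lie entirely within the stored
// bits.
static bool storeCoversLoad(int64_t Offset, TypeSize LdMemSize,
                            TypeSize StMemSize) {
  // Mixed scalable/fixed sizes cannot be compared at compile time.
  if (LdMemSize.isScalable() != StMemSize.isScalable())
    return false;
  if (LdMemSize.isScalable())
    return Offset == 0 && LdMemSize == StMemSize;
  return Offset >= 0 &&
         Offset * 8 + LdMemSize.getFixedSize() <= StMemSize.getFixedSize();
}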