diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -919,6 +919,8 @@
   setTargetDAGCombine(ISD::MUL);
 
+  setTargetDAGCombine(ISD::FP_EXTEND);
+
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::VSELECT);
 
@@ -15260,6 +15262,99 @@
   return SDValue();
 }
 
+static SDValue performFpExtendCombine(SDNode *N, SelectionDAG &DAG,
+                                      const AArch64Subtarget *Subtarget) {
+  SDLoc DL(N);
+  SDValue Op = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  if (!VT.isFixedLengthVector())
+    return SDValue();
+
+  if (DAG.getTargetLoweringInfo().isTypeLegal(VT) ||
+      !Subtarget->useSVEForFixedLengthVectors())
+    return SDValue();
+
+  // In cases where the result of the FP_EXTEND is not legal, it will be
+  // expanded into multiple extract_subvectors which cannot be lowered without
+  // going through memory.
+  //
+  // If we push an extend into the load feeding the FP_EXTEND, we can force the
+  // load to be expanded into the same number of parts as the FP_EXTEND,
+  // avoiding the need for extract_subvectors completely.
+  //
+  // As part of the lowering of FP_EXTEND for fixed-length types, uunpklo nodes
+  // will be introduced, which will then combine with the truncate introduced
+  // after the load.
+  if (ISD::isNormalLoad(Op.getNode())) {
+    LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
+
+    // Check if there are other uses. If so, do not combine as it will
+    // introduce an extra load.
+    for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
+         UI != UE; ++UI) {
+      if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
+        continue;
+      if (*UI != N)
+        return SDValue();
+    }
+
+    SDValue NewLoad = DAG.getExtLoad(
+        ISD::ZEXTLOAD, DL, VT.changeTypeToInteger(), LD->getChain(),
+        LD->getBasePtr(), LD->getMemoryVT().changeTypeToInteger(),
+        LD->getMemOperand());
+
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
+
+    SDValue Trunc = DAG.getNode(
+        ISD::TRUNCATE, DL, Op->getValueType(0).changeTypeToInteger(), NewLoad);
+    SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Trunc);
+
+    return DAG.getNode(ISD::FP_EXTEND, DL, VT, Bitcast);
+  }
+
+  return SDValue();
+}
+
+static SDValue performUunpkloCombine(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  SDValue Op = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // uunpklo(uzp1(x, x)) where x = bitcast(zextload) -> x
+  if (Op->getOpcode() == AArch64ISD::UZP1) {
+    EVT HalfVT = Op.getValueType();
+
+    // Ensure the unzip input is the same size as the unpack output.
+    if (Op->getOperand(0)->getOpcode() != ISD::BITCAST ||
+        Op->getValueType(0) == VT)
+      return SDValue();
+
+    SDValue Bitcast = Op->getOperand(0);
+
+    // Look through bitcasts and unzips.
+    SDValue Input = Bitcast->getOperand(0);
+    while (Input->getOpcode() == ISD::BITCAST ||
+           (Input->getOpcode() == AArch64ISD::UZP1 &&
+            Input->getOperand(0) == Input->getOperand(1)))
+      Input = Input->getOperand(0);
+
+    // Input should come from an extending load.
+    if (!isa<LoadSDNode>(Input) ||
+        cast<LoadSDNode>(Input)->getExtensionType() != ISD::ZEXTLOAD)
+      return SDValue();
+
+    // Ensure that we don't care about the top half of the input.
+    EVT MemVT = cast<LoadSDNode>(Input)->getMemoryVT();
+    if (isPackedVectorType(MemVT, DAG) &&
+        MemVT.getVectorElementType().getScalarSizeInBits() <=
+            HalfVT.getScalarSizeInBits())
+      return Bitcast->getOperand(0);
+  }
+
+  return SDValue();
+}
+
 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
   unsigned Opc = N->getOpcode();
@@ -16905,6 +17000,8 @@
     return performUzpCombine(N, DAG);
   case AArch64ISD::SETCC_MERGE_ZERO:
     return performSetccMergeZeroCombine(N, DAG);
+  case ISD::FP_EXTEND:
+    return performFpExtendCombine(N, DAG, Subtarget);
   case AArch64ISD::GLD1_MERGE_ZERO:
   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
@@ -16923,6 +17020,8 @@
   case AArch64ISD::VASHR:
   case AArch64ISD::VLSHR:
     return performVectorShiftCombine(N, *this, DCI);
+  case AArch64ISD::UUNPKLO:
+    return performUunpkloCombine(N, DAG);
   case ISD::INSERT_VECTOR_ELT:
     return performInsertVectorEltCombine(N, DCI);
   case ISD::EXTRACT_VECTOR_ELT:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
@@ -61,31 +61,17 @@
 }
 
 define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 {
-; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently.
+; Ensure sensible type legalisation.
 ; VBITS_EQ_256-LABEL: fcvt_v16f16_v16f32:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_EQ_256-NEXT:    sub x9, sp, #48
-; VBITS_EQ_256-NEXT:    mov x29, sp
-; VBITS_EQ_256-NEXT:    and sp, x9, #0xffffffffffffffe0
-; VBITS_EQ_256-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_EQ_256-NEXT:    .cfi_offset w30, -8
-; VBITS_EQ_256-NEXT:    .cfi_offset w29, -16
-; VBITS_EQ_256-NEXT:    ptrue p0.h, vl16
-; VBITS_EQ_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    mov x8, sp
-; VBITS_EQ_256-NEXT:    st1h { z0.h }, p0, [x8]
-; VBITS_EQ_256-NEXT:    ldp q0, q1, [sp]
-; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
 ; VBITS_EQ_256-NEXT:    mov x8, #8
-; VBITS_EQ_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT:    uunpklo z1.s, z1.h
+; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
+; VBITS_EQ_256-NEXT:    ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z1.s }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    fcvt z0.s, p0/m, z0.h
 ; VBITS_EQ_256-NEXT:    fcvt z1.s, p0/m, z1.h
-; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x1, x8, lsl #2]
-; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x1]
-; VBITS_EQ_256-NEXT:    mov sp, x29
-; VBITS_EQ_256-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    st1w { z1.s }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvt_v16f16_v16f32:
@@ -184,16 +170,12 @@
 ; Ensure sensible type legalisation.
 ; VBITS_EQ_256-LABEL: fcvt_v8f16_v8f64:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    ldr q0, [x0]
-; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_EQ_256-NEXT:    mov x8, #4
-; VBITS_EQ_256-NEXT:    uunpklo z1.s, z0.h
-; VBITS_EQ_256-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; VBITS_EQ_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_EQ_256-NEXT:    uunpklo z1.d, z1.s
-; VBITS_EQ_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT:    fcvt z1.d, p0/m, z1.h
+; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
+; VBITS_EQ_256-NEXT:    ld1h { z1.d }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    fcvt z0.d, p0/m, z0.h
+; VBITS_EQ_256-NEXT:    fcvt z1.d, p0/m, z1.h
 ; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
 ; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    ret
@@ -288,31 +270,17 @@
 }
 
 define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 {
-; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently.
+; Ensure sensible type legalisation.
 ; VBITS_EQ_256-LABEL: fcvt_v8f32_v8f64:
 ; VBITS_EQ_256:       // %bb.0:
-; VBITS_EQ_256-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; VBITS_EQ_256-NEXT:    sub x9, sp, #48
-; VBITS_EQ_256-NEXT:    mov x29, sp
-; VBITS_EQ_256-NEXT:    and sp, x9, #0xffffffffffffffe0
-; VBITS_EQ_256-NEXT:    .cfi_def_cfa w29, 16
-; VBITS_EQ_256-NEXT:    .cfi_offset w30, -8
-; VBITS_EQ_256-NEXT:    .cfi_offset w29, -16
-; VBITS_EQ_256-NEXT:    ptrue p0.s, vl8
-; VBITS_EQ_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
-; VBITS_EQ_256-NEXT:    mov x8, sp
-; VBITS_EQ_256-NEXT:    st1w { z0.s }, p0, [x8]
-; VBITS_EQ_256-NEXT:    ldp q0, q1, [sp]
-; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
 ; VBITS_EQ_256-NEXT:    mov x8, #4
-; VBITS_EQ_256-NEXT:    uunpklo z0.d, z0.s
-; VBITS_EQ_256-NEXT:    uunpklo z1.d, z1.s
+; VBITS_EQ_256-NEXT:    ptrue p0.d, vl4
+; VBITS_EQ_256-NEXT:    ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
+; VBITS_EQ_256-NEXT:    ld1w { z1.d }, p0/z, [x0]
 ; VBITS_EQ_256-NEXT:    fcvt z0.d, p0/m, z0.s
 ; VBITS_EQ_256-NEXT:    fcvt z1.d, p0/m, z1.s
-; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x1, x8, lsl #3]
-; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x1]
-; VBITS_EQ_256-NEXT:    mov sp, x29
-; VBITS_EQ_256-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; VBITS_EQ_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
+; VBITS_EQ_256-NEXT:    st1d { z1.d }, p0, [x1]
 ; VBITS_EQ_256-NEXT:    ret
 ;
 ; VBITS_GE_512-LABEL: fcvt_v8f32_v8f64: