diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -618,19 +618,27 @@
   ElementCount PartNumElts = PartVT.getVectorElementCount();
   ElementCount ValueNumElts = ValueVT.getVectorElementCount();

-  // We only support widening vectors with equivalent element types and
-  // fixed/scalable properties. If a target needs to widen a fixed-length type
-  // to a scalable one, it should be possible to use INSERT_SUBVECTOR below.
+  // We only support widening vectors with equivalent fixed/scalable properties.
+  // If a target needs to widen a fixed-length type to a scalable one, it should
+  // be possible to use INSERT_SUBVECTOR below.
   if (ElementCount::isKnownLE(PartNumElts, ValueNumElts) ||
-      PartNumElts.isScalable() != ValueNumElts.isScalable() ||
-      PartVT.getVectorElementType() != ValueVT.getVectorElementType())
+      PartNumElts.isScalable() != ValueNumElts.isScalable())
     return SDValue();

   // Widening a scalable vector to another scalable vector is done by inserting
   // the vector into a larger undef one.
-  if (PartNumElts.isScalable())
+  if (PartNumElts.isScalable()) {
+    EVT PromotedVT = EVT::getVectorVT(*DAG.getContext(), PartVT.getScalarType(),
+                                      ValueVT.getVectorElementCount());
+    if (PartVT.getVectorElementType() != ValueVT.getVectorElementType() &&
+        PartVT.getVectorElementType().isInteger())
+      Val = DAG.getAnyExtOrTrunc(Val, DL, PromotedVT);
     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PartVT, DAG.getUNDEF(PartVT),
                        Val, DAG.getVectorIdxConstant(0, DL));
+  }
+
+  if (PartVT.getVectorElementType() != ValueVT.getVectorElementType())
+    return SDValue();

   EVT ElementVT = PartVT.getVectorElementType();
   // Vector widening case, e.g. <2 x float> -> <4 x float>.  Shuffle in
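Note (illustration, not part of the patch): the sketch below is a minimal standalone model of the scalable-vector widening rule added above, written in plain C++ rather than against the LLVM APIs, and using a hypothetical ValueVT/PartVT pair (<vscale x 6 x i8> widened into a <vscale x 8 x i16> part) purely for illustration. It mirrors the three steps the new code performs: build a promoted type with the part's scalar type and the value's element count, any-extend the value if the integer element types differ, then insert it into an undef vector of the part type at index 0.

```cpp
// Standalone model (not the LLVM API) of the widening path above.
// The VT struct and the example types are hypothetical illustrations.
#include <cstdio>

struct VT {
  unsigned MinNumElts; // scalable element count, i.e. a multiple of vscale
  unsigned EltBits;    // integer element width in bits
};

int main() {
  VT ValueVT = {6, 8};  // e.g. <vscale x 6 x i8>  (hypothetical input value)
  VT PartVT = {8, 16};  // e.g. <vscale x 8 x i16> (hypothetical register part)

  // Step 1: PromotedVT keeps the value's element count but takes the part's
  // scalar type, mirroring EVT::getVectorVT(Ctx, PartVT.getScalarType(),
  // ValueVT.getVectorElementCount()) in the patch.
  VT PromotedVT = {ValueVT.MinNumElts, PartVT.EltBits};

  // Step 2: if the integer element types differ, the value is any-extended
  // (or truncated) element-wise to PromotedVT.
  bool NeedsAnyExt = ValueVT.EltBits != PartVT.EltBits;

  // Step 3: the (possibly extended) value is inserted into an undef vector of
  // PartVT at index 0 (ISD::INSERT_SUBVECTOR), producing the widened result.
  std::printf("value   : <vscale x %u x i%u>\n", ValueVT.MinNumElts, ValueVT.EltBits);
  std::printf("promoted: <vscale x %u x i%u>%s\n", PromotedVT.MinNumElts,
              PromotedVT.EltBits, NeedsAnyExt ? " (via any_extend)" : "");
  std::printf("widened : <vscale x %u x i%u> (undef with subvector at index 0)\n",
              PartVT.MinNumElts, PartVT.EltBits);
  return 0;
}
```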
diff --git a/llvm/test/CodeGen/AArch64/sve-split-load.ll b/llvm/test/CodeGen/AArch64/sve-split-load.ll
--- a/llvm/test/CodeGen/AArch64/sve-split-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-load.ll
@@ -36,7 +36,7 @@
   ret <vscale x 24 x i16> %load
 }

-define <vscale x 8 x i16> @load_widen_6i16(<vscale x 6 x i16>* %a) {
+define <vscale x 6 x i16> @load_widen_6i16(<vscale x 6 x i16>* %a) {
 ; CHECK-LABEL: load_widen_6i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
@@ -47,11 +47,10 @@
 ; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
 ; CHECK-NEXT:    ret
   %load = load <vscale x 6 x i16>, <vscale x 6 x i16>* %a, align 1
-  %r = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.nxv6i16(<vscale x 8 x i16> undef, <vscale x 6 x i16> %load, i64 0)
-  ret <vscale x 8 x i16> %r
+  ret <vscale x 6 x i16> %load
 }

-define <vscale x 4 x i32> @load_widen_1i32(<vscale x 1 x i32>* %a) {
+define <vscale x 1 x i32> @load_widen_1i32(<vscale x 1 x i32>* %a) {
 ; CHECK-LABEL: load_widen_1i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cntw x8
@@ -65,11 +64,10 @@
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %load = load <vscale x 1 x i32>, <vscale x 1 x i32>* %a, align 1
-  %r = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> undef, <vscale x 1 x i32> %load, i64 0)
-  ret <vscale x 4 x i32> %r
+  ret <vscale x 1 x i32> %load
 }

-define <vscale x 4 x i32> @load_widen_3i32(<vscale x 3 x i32>* %a) {
+define <vscale x 3 x i32> @load_widen_3i32(<vscale x 3 x i32>* %a) {
 ; CHECK-LABEL: load_widen_3i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cntw x8
@@ -80,11 +78,10 @@
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ret
   %load = load <vscale x 3 x i32>, <vscale x 3 x i32>* %a, align 1
-  %r = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv3i32(<vscale x 4 x i32> undef, <vscale x 3 x i32> %load, i64 0)
-  ret <vscale x 4 x i32> %r
+  ret <vscale x 3 x i32> %load
 }

-define <vscale x 8 x i32> @load_widen_6i32(<vscale x 6 x i32>* %a) {
+define <vscale x 6 x i32> @load_widen_6i32(<vscale x 6 x i32>* %a) {
 ; CHECK-LABEL: load_widen_6i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ptrue p0.s
@@ -94,11 +91,10 @@
 ; CHECK-NEXT:    uzp1 z1.s, z1.s, z0.s
 ; CHECK-NEXT:    ret
   %load = load <vscale x 6 x i32>, <vscale x 6 x i32>* %a, align 1
-  %r = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv6i32(<vscale x 8 x i32> undef, <vscale x 6 x i32> %load, i64 0)
-  ret <vscale x 8 x i32> %r
+  ret <vscale x 6 x i32> %load
 }

-define <vscale x 8 x i32> @load_widen_7i32(<vscale x 7 x i32>* %a) {
+define <vscale x 7 x i32> @load_widen_7i32(<vscale x 7 x i32>* %a) {
 ; CHECK-LABEL: load_widen_7i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cntw x9
@@ -114,8 +110,7 @@
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, #1, mul vl]
 ; CHECK-NEXT:    ret
   %load = load <vscale x 7 x i32>, <vscale x 7 x i32>* %a, align 1
-  %r = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv7i32(<vscale x 8 x i32> undef, <vscale x 7 x i32> %load, i64 0)
-  ret <vscale x 8 x i32> %r
+  ret <vscale x 7 x i32> %load
 }

 define <vscale x 32 x i16> @load_split_32i16(<vscale x 32 x i16>* %a) {
@@ -218,7 +213,7 @@
   ret <vscale x 8 x i64> %load
 }

-define <vscale x 8 x i16> @masked_load_widen_6i16(<vscale x 6 x i16>* %a, <vscale x 8 x i1> %pg.wide) {
+define <vscale x 6 x i16> @masked_load_widen_6i16(<vscale x 6 x i16>* %a, <vscale x 6 x i1> %pg) {
 ; CHECK-LABEL: masked_load_widen_6i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cnth x8
@@ -235,30 +230,27 @@
 ; CHECK-NEXT:    and p0.b, p2/z, p1.b, p0.b
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ret
-  %pg = call <vscale x 6 x i1> @llvm.experimental.vector.extract.nxv6i1.nxv8i1(<vscale x 8 x i1> %pg.wide, i64 0)
   %load = call <vscale x 6 x i16> @llvm.masked.load.nxv6i16(<vscale x 6 x i16> *%a, i32 1, <vscale x 6 x i1> %pg, <vscale x 6 x i16> undef)
-  %r = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.nxv6i16(<vscale x 8 x i16> undef, <vscale x 6 x i16> %load, i64 0)
-  ret <vscale x 8 x i16> %r
+  ret <vscale x 6 x i16> %load
 }

-define <vscale x 4 x i32> @masked_load_widen_1i32(<vscale x 1 x i32>* %a, <vscale x 4 x i1> %pg.wide) {
+define <vscale x 1 x i32> @masked_load_widen_1i32(<vscale x 1 x i32>* %a, <vscale x 1 x i1> %pg) {
 ; CHECK-LABEL: masked_load_widen_1i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cntw x8
 ; CHECK-NEXT:    index z0.s, #0, #1
-; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mov z1.s, w8
 ; CHECK-NEXT:    cmphi p2.s, p1/z, z1.s, z0.s
+; CHECK-NEXT:    uzp1 p0.s, p0.s, p0.s
 ; CHECK-NEXT:    and p0.b, p1/z, p2.b, p0.b
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ret
-  %pg = call <vscale x 1 x i1> @llvm.experimental.vector.extract.nxv1i1.nxv4i1(<vscale x 4 x i1> %pg.wide, i64 0)
   %load = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32(<vscale x 1 x i32> *%a, i32 1, <vscale x 1 x i1> %pg, <vscale x 1 x i32> undef)
-  %r = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> undef, <vscale x 1 x i32> %load, i64 0)
-  ret <vscale x 4 x i32> %r
+  ret <vscale x 1 x i32> %load
 }

-define <vscale x 4 x i32> @masked_load_widen_3i32(<vscale x 3 x i32>* %a, <vscale x 4 x i1> %pg.wide) {
+define <vscale x 3 x i32> @masked_load_widen_3i32(<vscale x 3 x i32>* %a, <vscale x 3 x i1> %pg) {
 ; CHECK-LABEL: masked_load_widen_3i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cntw x8
@@ -269,13 +261,11 @@
 ; CHECK-NEXT:    and p0.b, p1/z, p2.b, p0.b
 ; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ret
-  %pg = call <vscale x 3 x i1> @llvm.experimental.vector.extract.nxv3i1.nxv4i1(<vscale x 4 x i1> %pg.wide, i64 0)
   %load = call <vscale x 3 x i32> @llvm.masked.load.nxv3i32(<vscale x 3 x i32> *%a, i32 1, <vscale x 3 x i1> %pg, <vscale x 3 x i32> undef)
-  %r = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv3i32(<vscale x 4 x i32> undef, <vscale x 3 x i32> %load, i64 0)
-  ret <vscale x 4 x i32> %r
+  ret <vscale x 3 x i32> %load
 }

-define <vscale x 8 x i32> @masked_load_widen_6i32(<vscale x 6 x i32>* %a, <vscale x 8 x i1> %pg.wide) {
+define <vscale x 6 x i32> @masked_load_widen_6i32(<vscale x 6 x i32>* %a, <vscale x 6 x i1> %pg) {
 ; CHECK-LABEL: masked_load_widen_6i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cnth x8
@@ -296,13 +286,11 @@
 ; CHECK-NEXT:    ld1w { z0.s }, p2/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, #1, mul vl]
 ; CHECK-NEXT:    ret
-  %pg = call <vscale x 6 x i1> @llvm.experimental.vector.extract.nxv6i1.nxv8i1(<vscale x 8 x i1> %pg.wide, i64 0)
   %load = call <vscale x 6 x i32> @llvm.masked.load.nxv6i32(<vscale x 6 x i32> *%a, i32 1, <vscale x 6 x i1> %pg, <vscale x 6 x i32> undef)
-  %r = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv6i32(<vscale x 8 x i32> undef, <vscale x 6 x i32> %load, i64 0)
-  ret <vscale x 8 x i32> %r
+  ret <vscale x 6 x i32> %load
 }

-define <vscale x 8 x i32> @masked_load_widen_7i32(<vscale x 7 x i32>* %a, <vscale x 8 x i1> %pg.wide) {
+define <vscale x 7 x i32> @masked_load_widen_7i32(<vscale x 7 x i32>* %a, <vscale x 7 x i1> %pg) {
 ; CHECK-LABEL: masked_load_widen_7i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cnth x8
@@ -323,10 +311,8 @@
 ; CHECK-NEXT:    ld1w { z0.s }, p2/z, [x0]
 ; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, #1, mul vl]
 ; CHECK-NEXT:    ret
-  %pg = call <vscale x 7 x i1> @llvm.experimental.vector.extract.nxv7i1.nxv8i1(<vscale x 8 x i1> %pg.wide, i64 0)
   %load = call <vscale x 7 x i32> @llvm.masked.load.nxv7i32(<vscale x 7 x i32> *%a, i32 1, <vscale x 7 x i1> %pg, <vscale x 7 x i32> undef)
-  %r = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv7i32(<vscale x 8 x i32> undef, <vscale x 7 x i32> %load, i64 0)
-  ret <vscale x 8 x i32> %r
+  ret <vscale x 7 x i32> %load
 }

 declare <vscale x 32 x i8> @llvm.masked.load.nxv32i8(<vscale x 32 x i8>*, i32, <vscale x 32 x i1>, <vscale x 32 x i8>)
@@ -339,12 +325,3 @@
 declare <vscale x 7 x i32> @llvm.masked.load.nxv7i32(<vscale x 7 x i32>*, i32, <vscale x 7 x i1>, <vscale x 7 x i32>)
 declare <vscale x 8 x i32> @llvm.masked.load.nxv8i32(<vscale x 8 x i32>*, i32, <vscale x 8 x i1>, <vscale x 8 x i32>)
 declare <vscale x 8 x i64> @llvm.masked.load.nxv8i64(<vscale x 8 x i64>*, i32, <vscale x 8 x i1>, <vscale x 8 x i64>)
-declare <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.nxv6i16(<vscale x 8 x i16>, <vscale x 6 x i16>, i64)
-declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32>, <vscale x 1 x i32>, i64)
-declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv3i32(<vscale x 4 x i32>, <vscale x 3 x i32>, i64)
-declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv6i32(<vscale x 8 x i32>, <vscale x 6 x i32>, i64)
-declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv7i32(<vscale x 8 x i32>, <vscale x 7 x i32>, i64)
-declare <vscale x 1 x i1> @llvm.experimental.vector.extract.nxv1i1.nxv4i1(<vscale x 4 x i1>, i64)
-declare <vscale x 3 x i1> @llvm.experimental.vector.extract.nxv3i1.nxv4i1(<vscale x 4 x i1>, i64)
-declare <vscale x 6 x i1> @llvm.experimental.vector.extract.nxv6i1.nxv8i1(<vscale x 8 x i1>, i64)
-declare <vscale x 7 x i1> @llvm.experimental.vector.extract.nxv7i1.nxv8i1(<vscale x 8 x i1>, i64)