diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -4875,6 +4875,19 @@
   EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
   assert(NOutVT.isVector() && "This type must be promoted to a vector type");
 
+  if (NOutVT.isScalableVector()) {
+    unsigned NumOperands = N->getNumOperands();
+    SmallVector<SDValue> ConcatOps(NumOperands);
+    EVT ConcatOpVT = EVT::getVectorVT(
+        *DAG.getContext(), NOutVT.getVectorElementType(),
+        N->getOperand(0).getValueType().getVectorElementCount());
+    for (unsigned i = 0; i < NumOperands; ++i) {
+      ConcatOps[i] =
+          DAG.getNode(ISD::ANY_EXTEND, dl, ConcatOpVT, N->getOperand(i));
+    }
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, NOutVT, ConcatOps);
+  }
+
   EVT OutElemTy = NOutVT.getVectorElementType();
 
   unsigned NumElem = N->getOperand(0).getValueType().getVectorNumElements();
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1301,6 +1301,34 @@
     return;
   }
 
+  // insert_subvector(Op, SubVec, 0) where SubVec widens to the result type
+  // can be converted to a vselect.
+  if (IdxVal == 0 && VecVT.isScalableVector() &&
+      TLI.getTypeToTransformTo(*DAG.getContext(), SubVecVT) == VecVT) {
+    SDValue WidenedSubVec = GetWidenedVector(SubVec);
+    EVT CmpElementVT = MVT::i32;
+    EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), CmpElementVT,
+                                 VecVT.getVectorElementCount());
+    EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                  VecVT.getVectorElementCount());
+
+    SDLoc dl(N);
+    SDValue Step = DAG.getStepVector(dl, CmpVT);
+    unsigned NumElements = SubVec.getValueType().getVectorMinNumElements();
+    SDValue SplatNumElements = DAG.getSplatVector(
+        CmpVT, dl, DAG.getVScale(dl, CmpElementVT, APInt(32, NumElements)));
+    SDValue Mask =
+        DAG.getSetCC(dl, MaskVT, Step, SplatNumElements, ISD::SETULT);
+    SDValue Select =
+        DAG.getNode(ISD::VSELECT, dl, VecVT, Mask, WidenedSubVec, Vec);
+
+    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Lo.getValueType(), Select,
+                     DAG.getVectorIdxConstant(0, dl));
+    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Hi.getValueType(), Select,
+                     DAG.getVectorIdxConstant(LoElems, dl));
+    return;
+  }
+
   // Spill the vector to the stack.
   // In cases where the vector is illegal it will be broken down into parts
   // and stored in parts - we should use the alignment for the smallest part.
@@ -3979,9 +4007,17 @@
   if (IdxVal == 0 && InVT == WidenVT)
     return InOp;
 
-  if (VT.isScalableVector())
+  if (VT.isScalableVector()) {
+    unsigned WidenNumElts = WidenVT.getVectorMinNumElements();
+    unsigned InNumElts = InVT.getVectorMinNumElements();
+    if (IdxVal % WidenNumElts == 0 && IdxVal + WidenNumElts < InNumElts)
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, WidenVT, InOp, Idx);
+  }
+
+  if (InVT.isScalableVector()) {
     report_fatal_error("Don't know how to widen the result of "
                        "EXTRACT_SUBVECTOR for scalable vectors");
+  }
 
   // Check if we can extract from the vector.
   unsigned WidenNumElts = WidenVT.getVectorNumElements();
@@ -4015,6 +4051,35 @@
 SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
   LoadSDNode *LD = cast<LoadSDNode>(N);
   ISD::LoadExtType ExtType = LD->getExtensionType();
+  EVT VT = N->getValueType(0);
+
+  // FIXME: Figure out how to replace constant "2".
+  if (VT.isScalableVector() &&
+      !VT.getVectorElementCount().isKnownMultipleOf(2)) {
+    // Convert load to masked load. Let MLOAD legalization handle widening.
+    // (We assume hardware with scalable vectors supports masked load/store.)
+    SDLoc dl(N);
+    EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+    EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                  VT.getVectorElementCount());
+    SDValue Mask = DAG.getAllOnesConstant(dl, MaskVT);
+    SDValue PassThru = DAG.getUNDEF(VT);
+
+    // Convert load to masked load. Let MLOAD legalization handle widening.
+    SDValue Res = DAG.getMaskedLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
+                                    LD->getOffset(), Mask, PassThru,
+                                    LD->getMemoryVT(), LD->getMemOperand(),
+                                    LD->getAddressingMode(), ExtType);
+
+    // Legalize the chain result - switch anything that used the old chain to
+    // use the new one.
+    ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+
+    // Widen the result.
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WidenVT,
+                       DAG.getUNDEF(WidenVT), Res,
+                       DAG.getVectorIdxConstant(0, dl));
+  }
 
   // A vector must always be stored in memory as-is, i.e. without any padding
   // between the elements, since various code depend on it, e.g. in the
@@ -4054,8 +4119,10 @@
 }
 
 SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
+  assert(N->getAddressingMode() == ISD::UNINDEXED &&
+         "We shouldn't form indexed loads with illegal types");
 
-  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0));
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue Mask = N->getMask();
   EVT MaskVT = Mask.getValueType();
   SDValue PassThru = GetWidenedVector(N->getPassThru());
@@ -4063,15 +4130,23 @@
   SDLoc dl(N);
 
   // The mask should be widened as well
-  EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
-                                    MaskVT.getVectorElementType(),
-                                    WidenVT.getVectorNumElements());
+  EVT WideMaskVT =
+      EVT::getVectorVT(*DAG.getContext(), MaskVT.getVectorElementType(),
+                       WidenVT.getVectorElementCount());
   Mask = ModifyToType(Mask, WideMaskVT, true);
 
-  SDValue Res = DAG.getMaskedLoad(
-      WidenVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
-      PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
-      ExtType, N->isExpandingLoad());
+  EVT MemVT = N->getMemoryVT();
+  EVT WideMemVT =
+      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
+                       WidenVT.getVectorElementCount());
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineMemOperand *MemOp = MF.getMachineMemOperand(
+      N->getMemOperand(), 0, MemoryLocation::UnknownSize);
+
+  SDValue Res =
+      DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(),
+                        N->getOffset(), Mask, PassThru, WideMemVT, MemOp,
+                        ISD::UNINDEXED, ExtType, N->isExpandingLoad());
   // Legalize the chain result - switch anything that used the old chain to
   // use the new one.
   ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
@@ -4830,6 +4905,9 @@
     return GetWidenedVector(N->getOperand(0));
   }
 
+  if (InVT.isScalableVector())
+    report_fatal_error("Cannot legalize this scalable CONCAT_VECTORS");
+
   // Otherwise, fall back to a nasty build vector.
   unsigned NumElts = VT.getVectorNumElements();
   SmallVector<SDValue, 16> Ops(NumElts);
@@ -4864,6 +4942,38 @@
       N->getConstantOperandVal(2) == 0)
     return SubVec;
 
+  if (InVec.getValueType().isScalableVector() &&
+      N->getConstantOperandVal(2) == 0) {
+    SDLoc dl(N);
+    EVT VT = InVec.getValueType();
+    EVT CmpElementVT = MVT::i32;
+    EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), CmpElementVT,
+                                 VT.getVectorElementCount());
+    EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                  VT.getVectorElementCount());
+
+    if (SubVec.getValueType() != VT) {
+      // If the widened SubVec is still too narrow, widen it again.
+      unsigned NumConcat = VT.getVectorMinNumElements() /
+                           SubVec.getValueType().getVectorMinNumElements();
+      SmallVector<SDValue> Ops(NumConcat);
+      SDValue FillVal = DAG.getUNDEF(SubVec.getValueType());
+      Ops[0] = SubVec;
+      for (unsigned i = 1; i != NumConcat; ++i)
+        Ops[i] = FillVal;
+
+      SubVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
+    }
+
+    SDValue Step = DAG.getStepVector(dl, CmpVT);
+    unsigned NumElements = VT.getVectorMinNumElements();
+    SDValue SplatNumElements = DAG.getSplatVector(
+        CmpVT, dl, DAG.getVScale(dl, CmpElementVT, APInt(32, NumElements)));
+    SDValue Mask =
+        DAG.getSetCC(dl, MaskVT, Step, SplatNumElements, ISD::SETULT);
+    return DAG.getNode(ISD::VSELECT, dl, VT, Mask, SubVec, InVec);
+  }
+
   report_fatal_error("Don't know how to widen the operands for "
                      "INSERT_SUBVECTOR");
 }
@@ -5542,8 +5652,20 @@
   if (InVT == NVT)
     return InOp;
 
-  unsigned InNumElts = InVT.getVectorNumElements();
-  unsigned WidenNumElts = NVT.getVectorNumElements();
+  unsigned InNumElts = InVT.getVectorMinNumElements();
+  unsigned WidenNumElts = NVT.getVectorMinNumElements();
+
+  if (NVT.isScalableVector()) {
+    if (WidenNumElts < InNumElts)
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
+                         DAG.getVectorIdxConstant(0, dl));
+
+    SDValue FillVal =
+        FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT);
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
+                       DAG.getVectorIdxConstant(0, dl));
+  }
+
   if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) {
     unsigned NumConcat = WidenNumElts / InNumElts;
     SmallVector<SDValue, 16> Ops(NumConcat);
diff --git a/llvm/test/CodeGen/AArch64/sve-split-load.ll b/llvm/test/CodeGen/AArch64/sve-split-load.ll
--- a/llvm/test/CodeGen/AArch64/sve-split-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-load.ll
@@ -9,7 +9,7 @@
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    ret
-  %load = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a
+  %load = load <vscale x 4 x i16>, <vscale x 4 x i16>* %a, align 1
   ret <vscale x 4 x i16> %load
 }
 
@@ -20,7 +20,7 @@
 ; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, #1, mul vl]
 ; CHECK-NEXT:    ret
-  %load = load <vscale x 16 x i16>, <vscale x 16 x i16>* %a
+  %load = load <vscale x 16 x i16>, <vscale x 16 x i16>* %a, align 1
   ret <vscale x 16 x i16> %load
 }
 
@@ -32,10 +32,92 @@
 ; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, #1, mul vl]
 ; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, #2, mul vl]
 ; CHECK-NEXT:    ret
-  %load = load <vscale x 24 x i16>, <vscale x 24 x i16>* %a
+  %load = load <vscale x 24 x i16>, <vscale x 24 x i16>* %a, align 1
   ret <vscale x 24 x i16> %load
 }
 
+define <vscale x 8 x i16> @load_widen_6i16(<vscale x 6 x i16>* %a) {
+; CHECK-LABEL: load_widen_6i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1h { z1.d }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %load = load <vscale x 6 x i16>, <vscale x 6 x i16>* %a, align 1
+  %r = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.nxv6i16(<vscale x 8 x i16> undef, <vscale x 6 x i16> %load, i64 0)
+  ret <vscale x 8 x i16> %r
+}
+
+define <vscale x 4 x i32> @load_widen_1i32(<vscale x 1 x i32>* %a) {
+; CHECK-LABEL: load_widen_1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    cmphi p2.s, p0/z, z1.s, z0.s
+; CHECK-NEXT:    uzp1 p1.s, p1.s, p0.s
+; CHECK-NEXT:    and p0.b, p0/z, p2.b, p1.b
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 1 x i32>, <vscale x 1 x i32>* %a, align 1
+  %r = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> undef, <vscale x 1 x i32> %load, i64 0)
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @load_widen_3i32(<vscale x 3 x i32>* %a) {
+; CHECK-LABEL: load_widen_3i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    cmphi p0.s, p0/z, z1.s, z0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 3 x i32>, <vscale x 3 x i32>* %a, align 1
+  %r = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv3i32(<vscale x 4 x i32> undef, <vscale x 3 x i32> %load, i64 0)
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 8 x i32> @load_widen_6i32(<vscale x 6 x i32>* %a) {
+; CHECK-LABEL: load_widen_6i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ld1w { z1.d }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT:    uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT:    ret
+  %load = load <vscale x 6 x i32>, <vscale x 6 x i32>* %a, align 1
+  %r = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv6i32(<vscale x 8 x i32> undef, <vscale x 6 x i32> %load, i64 0)
+  ret <vscale x 8 x i32> %r
+}
+
+define <vscale x 8 x i32> @load_widen_7i32(<vscale x 7 x i32>* %a) {
+; CHECK-LABEL: load_widen_7i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntw x9
+; CHECK-NEXT:    cnth x8
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    mov z2.s, w9
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    add z2.s, z0.s, z2.s
+; CHECK-NEXT:    cmphi p1.s, p0/z, z1.s, z0.s
+; CHECK-NEXT:    cmphi p0.s, p0/z, z1.s, z2.s
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+  %load = load <vscale x 7 x i32>, <vscale x 7 x i32>* %a, align 1
+  %r = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv7i32(<vscale x 8 x i32> undef, <vscale x 7 x i32> %load, i64 0)
+  ret <vscale x 8 x i32> %r
+}
+
 define <vscale x 32 x i16> @load_split_32i16(<vscale x 32 x i16>* %a) {
 ; CHECK-LABEL: load_split_32i16:
 ; CHECK:       // %bb.0:
@@ -45,7 +127,7 @@
 ; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, #2, mul vl]
 ; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x0, #3, mul vl]
 ; CHECK-NEXT:    ret
-  %load = load <vscale x 32 x i16>, <vscale x 32 x i16>* %a
+  %load = load <vscale x 32 x i16>, <vscale x 32 x i16>* %a, align 1
   ret <vscale x 32 x i16> %load
 }
 
@@ -62,7 +144,7 @@
 ; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x0, #6, mul vl]
 ; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x0, #7, mul vl]
 ; CHECK-NEXT:    ret
-  %load = load <vscale x 16 x i64>, <vscale x 16 x i64>* %a
+  %load = load <vscale x 16 x i64>, <vscale x 16 x i64>* %a, align 1
   ret <vscale x 16 x i64> %load
 }
 
@@ -136,11 +218,133 @@
   ret <vscale x 8 x i64> %load
 }
 
-declare <vscale x 32 x i8> @llvm.masked.load.nxv32i8(<vscale x 32 x i8>*, i32, <vscale x 32 x i1>, <vscale x 32 x i8>)
+define <vscale x 8 x i16> @masked_load_widen_6i16(<vscale x 6 x i16>* %a, <vscale x 8 x i1> %pg.wide) {
+; CHECK-LABEL: masked_load_widen_6i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnth x8
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    cmphi p2.s, p1/z, z1.s, z0.s
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-NEXT:    cmphi p1.s, p1/z, z1.s, z0.s
+; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
+; CHECK-NEXT:    ptrue p2.h
+; CHECK-NEXT:    and p0.b, p2/z, p1.b, p0.b
+; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %pg = call <vscale x 6 x i1> @llvm.experimental.vector.extract.nxv6i1.nxv8i1(<vscale x 8 x i1> %pg.wide, i64 0)
+  %load = call <vscale x 6 x i16> @llvm.masked.load.nxv6i16(<vscale x 6 x i16> *%a, i32 1, <vscale x 6 x i1> %pg, <vscale x 6 x i16> undef)
+  %r = call <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.nxv6i16(<vscale x 8 x i16> undef, <vscale x 6 x i16> %load, i64 0)
+  ret <vscale x 8 x i16> %r
+}
 
-declare <vscale x 32 x i16> @llvm.masked.load.nxv32i16(<vscale x 32 x i16>*, i32, <vscale x 32 x i1>, <vscale x 32 x i16>)
+define <vscale x 4 x i32> @masked_load_widen_1i32(<vscale x 1 x i32>* %a, <vscale x 4 x i1> %pg.wide) {
+; CHECK-LABEL: masked_load_widen_1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    cmphi p2.s, p1/z, z1.s, z0.s
+; CHECK-NEXT:    and p0.b, p1/z, p2.b, p0.b
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %pg = call <vscale x 1 x i1> @llvm.experimental.vector.extract.nxv1i1.nxv4i1(<vscale x 4 x i1> %pg.wide, i64 0)
+  %load = call <vscale x 1 x i32> @llvm.masked.load.nxv1i32(<vscale x 1 x i32> *%a, i32 1, <vscale x 1 x i1> %pg, <vscale x 1 x i32> undef)
+  %r = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> undef, <vscale x 1 x i32> %load, i64 0)
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 4 x i32> @masked_load_widen_3i32(<vscale x 3 x i32>* %a, <vscale x 4 x i1> %pg.wide) {
+; CHECK-LABEL: masked_load_widen_3i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    cmphi p2.s, p1/z, z1.s, z0.s
+; CHECK-NEXT:    and p0.b, p1/z, p2.b, p0.b
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %pg = call <vscale x 3 x i1> @llvm.experimental.vector.extract.nxv3i1.nxv4i1(<vscale x 4 x i1> %pg.wide, i64 0)
+  %load = call <vscale x 3 x i32> @llvm.masked.load.nxv3i32(<vscale x 3 x i32> *%a, i32 1, <vscale x 3 x i1> %pg, <vscale x 3 x i32> undef)
+  %r = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv3i32(<vscale x 4 x i32> undef, <vscale x 3 x i32> %load, i64 0)
+  ret <vscale x 4 x i32> %r
+}
+
+define <vscale x 8 x i32> @masked_load_widen_6i32(<vscale x 6 x i32>* %a, <vscale x 8 x i1> %pg.wide) {
+; CHECK-LABEL: masked_load_widen_6i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnth x8
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    cmphi p2.s, p1/z, z1.s, z0.s
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-NEXT:    cmphi p1.s, p1/z, z1.s, z0.s
+; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
+; CHECK-NEXT:    ptrue p2.h
+; CHECK-NEXT:    and p0.b, p2/z, p1.b, p0.b
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    zip1 p2.h, p0.h, p1.h
+; CHECK-NEXT:    zip2 p0.h, p0.h, p1.h
+; CHECK-NEXT:    ld1w { z0.s }, p2/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+  %pg = call <vscale x 6 x i1> @llvm.experimental.vector.extract.nxv6i1.nxv8i1(<vscale x 8 x i1> %pg.wide, i64 0)
+  %load = call <vscale x 6 x i32> @llvm.masked.load.nxv6i32(<vscale x 6 x i32> *%a, i32 1, <vscale x 6 x i1> %pg, <vscale x 6 x i32> undef)
+  %r = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv6i32(<vscale x 8 x i32> undef, <vscale x 6 x i32> %load, i64 0)
+  ret <vscale x 8 x i32> %r
+}
 
+define <vscale x 8 x i32> @masked_load_widen_7i32(<vscale x 7 x i32>* %a, <vscale x 8 x i1> %pg.wide) {
+; CHECK-LABEL: masked_load_widen_7i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnth x8
+; CHECK-NEXT:    mov z1.s, w8
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    index z0.s, #0, #1
+; CHECK-NEXT:    mov z2.s, w8
+; CHECK-NEXT:    cmphi p2.s, p1/z, z1.s, z0.s
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-NEXT:    cmphi p1.s, p1/z, z1.s, z0.s
+; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
+; CHECK-NEXT:    ptrue p2.h
+; CHECK-NEXT:    and p0.b, p2/z, p1.b, p0.b
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    zip1 p2.h, p0.h, p1.h
+; CHECK-NEXT:    zip2 p0.h, p0.h, p1.h
+; CHECK-NEXT:    ld1w { z0.s }, p2/z, [x0]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+  %pg = call <vscale x 7 x i1> @llvm.experimental.vector.extract.nxv7i1.nxv8i1(<vscale x 8 x i1> %pg.wide, i64 0)
+  %load = call <vscale x 7 x i32> @llvm.masked.load.nxv7i32(<vscale x 7 x i32> *%a, i32 1, <vscale x 7 x i1> %pg, <vscale x 7 x i32> undef)
+  %r = call <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv7i32(<vscale x 8 x i32> undef, <vscale x 7 x i32> %load, i64 0)
+  ret <vscale x 8 x i32> %r
+}
+
+declare <vscale x 32 x i8> @llvm.masked.load.nxv32i8(<vscale x 32 x i8>*, i32, <vscale x 32 x i1>, <vscale x 32 x i8>)
+declare <vscale x 6 x i16> @llvm.masked.load.nxv6i16(<vscale x 6 x i16>*, i32, <vscale x 6 x i1>, <vscale x 6 x i16>)
+declare <vscale x 32 x i16> @llvm.masked.load.nxv32i16(<vscale x 32 x i16>*, i32, <vscale x 32 x i1>, <vscale x 32 x i16>)
+declare <vscale x 1 x i32> @llvm.masked.load.nxv1i32(<vscale x 1 x i32>*, i32, <vscale x 1 x i1>, <vscale x 1 x i32>)
 declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 3 x i32> @llvm.masked.load.nxv3i32(<vscale x 3 x i32>*, i32, <vscale x 3 x i1>, <vscale x 3 x i32>)
+declare <vscale x 6 x i32> @llvm.masked.load.nxv6i32(<vscale x 6 x i32>*, i32, <vscale x 6 x i1>, <vscale x 6 x i32>)
+declare <vscale x 7 x i32> @llvm.masked.load.nxv7i32(<vscale x 7 x i32>*, i32, <vscale x 7 x i1>, <vscale x 7 x i32>)
 declare <vscale x 8 x i32> @llvm.masked.load.nxv8i32(<vscale x 8 x i32>*, i32, <vscale x 8 x i1>, <vscale x 8 x i32>)
-
 declare <vscale x 8 x i64> @llvm.masked.load.nxv8i64(<vscale x 8 x i64>*, i32, <vscale x 8 x i1>, <vscale x 8 x i64>)
+declare <vscale x 8 x i16> @llvm.experimental.vector.insert.nxv8i16.nxv6i16(<vscale x 8 x i16>, <vscale x 6 x i16>, i64)
+declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32>, <vscale x 1 x i32>, i64)
+declare <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv3i32(<vscale x 4 x i32>, <vscale x 3 x i32>, i64)
+declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv6i32(<vscale x 8 x i32>, <vscale x 6 x i32>, i64)
+declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv8i32.nxv7i32(<vscale x 8 x i32>, <vscale x 7 x i32>, i64)
+declare <vscale x 1 x i1> @llvm.experimental.vector.extract.nxv1i1.nxv4i1(<vscale x 4 x i1>, i64)
+declare <vscale x 3 x i1> @llvm.experimental.vector.extract.nxv3i1.nxv4i1(<vscale x 4 x i1>, i64)
+declare <vscale x 6 x i1> @llvm.experimental.vector.extract.nxv6i1.nxv8i1(<vscale x 8 x i1>, i64)
+declare <vscale x 7 x i1> @llvm.experimental.vector.extract.nxv7i1.nxv8i1(<vscale x 8 x i1>, i64)