diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -929,6 +929,7 @@
     SDValue SubV = Node->getOperand(1);
     SDLoc DL(SubV);
     auto Idx = Node->getConstantOperandVal(2);
+    MVT XLenVT = Subtarget->getXLenVT();
     MVT SubVecVT = Node->getOperand(1).getSimpleValueType();
 
     // TODO: This method of selecting INSERT_SUBVECTOR should work
@@ -936,24 +937,6 @@
     // correctly identify the canonical register class for fixed-length types.
     // For now, keep the two paths separate.
     if (VT.isScalableVector() && SubVecVT.isScalableVector()) {
-      bool IsFullVecReg = false;
-      switch (RISCVTargetLowering::getLMUL(SubVecVT)) {
-      default:
-        break;
-      case RISCVVLMUL::LMUL_1:
-      case RISCVVLMUL::LMUL_2:
-      case RISCVVLMUL::LMUL_4:
-      case RISCVVLMUL::LMUL_8:
-        IsFullVecReg = true;
-        break;
-      }
-
-      // If the subvector doesn't occupy a full vector register then we can't
-      // insert it purely using subregister manipulation. We must not clobber
-      // the untouched elements (say, in the upper half of the VR register).
-      if (!IsFullVecReg)
-        break;
-
       const auto *TRI = Subtarget->getRegisterInfo();
       unsigned SubRegIdx;
       std::tie(SubRegIdx, Idx) =
@@ -966,9 +949,32 @@
       if (Idx != 0)
        break;
 
+      RISCVVLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
+      bool IsSubVecPartReg = SubVecLMUL == RISCVVLMUL::LMUL_F2 ||
+                             SubVecLMUL == RISCVVLMUL::LMUL_F4 ||
+                             SubVecLMUL == RISCVVLMUL::LMUL_F8;
+      (void)IsSubVecPartReg; // Silence unused variable warning without asserts.
+      assert((!IsSubVecPartReg || V.isUndef()) &&
+             "Expecting lowering to have created legal INSERT_SUBVECTORs when "
+             "the subvector is smaller than a full-sized register");
+
+      // If we haven't set a SubRegIdx, then we must be going between LMUL<=1
+      // types (VR -> VR). This can be done as a copy.
+      if (SubRegIdx == RISCV::NoSubRegister) {
+        unsigned InRegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT);
+        assert(RISCVTargetLowering::getRegClassIDForVecVT(SubVecVT) ==
+                   RISCV::VRRegClassID &&
+               InRegClassID == RISCV::VRRegClassID &&
+               "Unexpected subvector extraction");
+        SDValue RC = CurDAG->getTargetConstant(InRegClassID, DL, XLenVT);
+        SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                                 DL, VT, SubV, RC);
+        return ReplaceNode(Node, NewNode);
+      }
+
       SDNode *NewNode = CurDAG->getMachineNode(
           TargetOpcode::INSERT_SUBREG, DL, VT, V, SubV,
-          CurDAG->getTargetConstant(SubRegIdx, DL, Subtarget->getXLenVT()));
+          CurDAG->getTargetConstant(SubRegIdx, DL, XLenVT));
       return ReplaceNode(Node, NewNode);
     }
 
@@ -981,8 +987,7 @@
     unsigned RegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT);
 
-    SDValue RC =
-        CurDAG->getTargetConstant(RegClassID, DL, Subtarget->getXLenVT());
+    SDValue RC = CurDAG->getTargetConstant(RegClassID, DL, XLenVT);
     SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                              DL, VT, SubV, RC);
     ReplaceNode(Node, NewNode);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -417,6 +417,7 @@
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -465,6 +465,7 @@
       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
 
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
     }
 
@@ -501,6 +502,7 @@
      setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
 
      setOperationAction(ISD::FCOPYSIGN, VT, Legal);
 
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
     };
 
@@ -1327,6 +1329,8 @@
   case ISD::VECREDUCE_FADD:
   case ISD::VECREDUCE_SEQ_FADD:
     return lowerFPVECREDUCE(Op, DAG);
+  case ISD::INSERT_SUBVECTOR:
+    return lowerINSERT_SUBVECTOR(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:
     return lowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::BUILD_VECTOR:
@@ -2234,6 +2238,87 @@
       RISCV::RVVBitsPerBlock / VT.getVectorElementType().getSizeInBits());
 }
 
+SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDValue Vec = Op.getOperand(0);
+  SDValue SubVec = Op.getOperand(1);
+  MVT VecVT = Vec.getSimpleValueType();
+  MVT SubVecVT = SubVec.getSimpleValueType();
+
+  // TODO: Only handle scalable->scalable inserts for now, and revisit this
+  // for fixed-length vectors later.
+  if (!SubVecVT.isScalableVector() || !VecVT.isScalableVector())
+    return Op;
+
+  SDLoc DL(Op);
+  unsigned OrigIdx = Op.getConstantOperandVal(2);
+  const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+  unsigned SubRegIdx, RemIdx;
+  std::tie(SubRegIdx, RemIdx) =
+      RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+          VecVT, SubVecVT, OrigIdx, TRI);
+
+  RISCVVLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
+  bool IsSubVecPartReg = SubVecLMUL == RISCVVLMUL::LMUL_F2 ||
+                         SubVecLMUL == RISCVVLMUL::LMUL_F4 ||
+                         SubVecLMUL == RISCVVLMUL::LMUL_F8;
+
+  // If the Idx has been completely eliminated and this subvector's size is a
+  // vector register or a multiple thereof, or the surrounding elements are
+  // undef, then this is a subvector insert which naturally aligns to a vector
+  // register. These can easily be handled using subregister manipulation.
+  // If the subvector is smaller than a vector register, then the insertion
+  // must preserve the undisturbed elements of the register. We do this by
+  // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
+  // (which resolves to a subregister copy), performing a VSLIDEUP to place
+  // the subvector within the vector register, and an INSERT_SUBVECTOR of that
+  // LMUL=1 type back into the larger vector (resolving to another subregister
+  // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
+  // to avoid allocating a large register group to hold our subvector.
+  if (RemIdx == 0 && (!IsSubVecPartReg || Vec.isUndef()))
+    return Op;
+
+  // VSLIDEUP works by leaving elements 0..OFFSET-1 of the destination
+  // undisturbed.
 define <vscale x 8 x i32> @insert_nxv8i32_nxv4i32_0(<vscale x 8 x i32> %vec, <vscale x 4 x i32> %subvec) {
 ; CHECK-LABEL: insert_nxv8i32_nxv4i32_0:
@@ -181,21 +181,191 @@
   ret %v
 }
 
-; TODO: Inserts that are less than LMUL=1 are not yet supported. In this case
-; we need mask out the unaffected elements (top half of the VR %subvec
-; register)
-;define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_0(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
-;  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 0)
-;  ret <vscale x 16 x i32> %v
-;}
+define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_0(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
+; CHECK-LABEL: insert_nxv16i32_nxv1i32_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v16, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 0)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
+; CHECK-LABEL: insert_nxv16i32_nxv1i32_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli a1, a1, e32,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v16, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_6(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
+; CHECK-LABEL: insert_nxv16i32_nxv1i32_6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v11, v16, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 6)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_0(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a0, a0, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v10, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 0)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_1(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli a1, a1, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 1)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_2(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    vsetvli a0, a0, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 2)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_3(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a1, a1, a0
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    vsetvli a0, a0, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 3)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_7(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    vsetvli a1, a1, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 7)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_15(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_15:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    vsetvli a1, a1, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v9, v10, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 15)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_0(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_nxv2f16_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v16, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 0)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_2(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_nxv2f16_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli a1, a1, e16,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v16, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 2)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_26(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_nxv2f16_26:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli a1, a1, e16,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v14, v16, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 26)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_undef_nxv1f16_0(<vscale x 1 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_undef_nxv1f16_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $v8 killed $v8 def $v8m8
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv1f16.nxv32f16(<vscale x 32 x half> undef, <vscale x 1 x half> %subvec, i64 0)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_undef_nxv1f16_26(<vscale x 1 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_undef_nxv1f16_26:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    vsetvli a0, a0, e16,m1,ta,mu
+; CHECK-NEXT:    vslideup.vx v22, v8, a1
+; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv1f16.nxv32f16(<vscale x 32 x half> undef, <vscale x 1 x half> %subvec, i64 26)
+  ret <vscale x 32 x half> %v
+}
+
+declare <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8>, <vscale x 1 x i8>, i64)
 
-; TODO: Inserts that don't align to a vector register are not yet supported.
-; In this case we want to insert the subvector into the upper half of the
-; lowest VR subregister in the LMUL group.
-;define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
-;  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
-;  ret <vscale x 16 x i32> %v
-;}
+declare <vscale x 32 x half> @llvm.experimental.vector.insert.nxv1f16.nxv32f16(<vscale x 32 x half>, <vscale x 1 x half>, i64)
+declare <vscale x 32 x half> @llvm.experimental.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half>, <vscale x 2 x half>, i64)
 declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32>, <vscale x 2 x i32>, i64 %idx)
 declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv8i32(<vscale x 8 x i32>, <vscale x 4 x i32>, i64 %idx)
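
As a worked illustration of the arithmetic the new lowering comments describe (this sketch is not part of the patch, and decomposeInsert, EltsPerVReg and SubVecMinElts are made-up names rather than LLVM API), the following standalone C++ computes, in multiples of vscale, which register of the LMUL group a sub-register-sized insert touches, the vslideup offset, and the AVL used for the tail-undisturbed vsetvli:

// Standalone illustration (not LLVM code; all names are invented for the
// example): how a smaller-than-a-register subvector insert index splits into
// (a) the vector register within the LMUL group that is written, (b) the
// vslideup offset and (c) the AVL for the tail-undisturbed vsetvli.
#include <cassert>
#include <cstdio>

struct SubRegInsert {
  unsigned VRegInGroup; // which register of the LMUL register group is touched
  unsigned SlideOffset; // vslideup offset, in multiples of vscale elements
  unsigned VL;          // vsetvli AVL, in multiples of vscale elements
};

// EltsPerVReg:   minimum element count one vector register (LMUL=1) holds for
//                this element type, e.g. 8 for i8 with a 64-bit RVV block.
// SubVecMinElts: minimum element count of the subvector being inserted.
// Idx:           the INSERT_SUBVECTOR index, in minimum elements.
SubRegInsert decomposeInsert(unsigned EltsPerVReg, unsigned SubVecMinElts,
                             unsigned Idx) {
  assert(Idx % SubVecMinElts == 0 && "index must be a multiple of the "
                                     "subvector's known minimum size");
  assert(SubVecMinElts <= EltsPerVReg &&
         "subvector must fit within one vector register");
  SubRegInsert R;
  R.VRegInGroup = Idx / EltsPerVReg;    // handled as a subregister index
  R.SlideOffset = Idx % EltsPerVReg;    // remainder handled by the vslideup
  R.VL = R.SlideOffset + SubVecMinElts; // elements below the offset and past
                                        // VL stay undisturbed (tu policy)
  return R;
}

int main() {
  // Mirrors the insert_nxv16i8_nxv1i8_15 test: nxv1i8 into nxv16i8 at index 15.
  SubRegInsert R = decomposeInsert(/*EltsPerVReg=*/8, /*SubVecMinElts=*/1,
                                   /*Idx=*/15);
  std::printf("vreg #%u in group, offset %u*vscale, VL %u*vscale\n",
              R.VRegInGroup, R.SlideOffset, R.VL); // vreg #1, offset 7, VL 8
  return 0;
}

Plugging in the insert_nxv16i8_nxv1i8_15 test above, where one vector register holds vscale x 8 bytes, gives register #1 of the group, a slide offset of 7*vscale and an AVL of 8*vscale, which matches the csrr/srli/slli/sub sequence and the vslideup.vx into v9 in that test's CHECK lines.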