diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3483,8 +3483,13 @@
   SDValue VL = N->getOperand(IsTA ? 1 : 2);
   assert(Src.getResNo() == 0 && "Src should be the first value of a node.");
 
-  // TODO: We should peel off layers of COPY_TO_REGCLASS so we can
-  // handle merges of different vec lengths
+  // We can fold moves of different size reg classes
+  while (Src->isMachineOpcode() &&
+         Src->getMachineOpcode() == TargetOpcode::COPY_TO_REGCLASS) {
+    if (Src->use_empty() || !Src->use_begin()->isOnlyUserOf(Src.getNode()))
+      return false;
+    Src = Src->getOperand(0);
+  }
 
   // Src can only have one user, N.
   if (!Src.hasOneUse())
@@ -3601,7 +3606,14 @@
   else if (N->getGluedNode())
     Ops.push_back(N->getOperand(N->getNumOperands() - 1));
 
-  SDNode *Result = CurDAG->getMachineNode(NewOpc, DL, Src->getVTList(), Ops);
+  // Since we might end up changing the register class, change the vector result
+  // types to be that of the vmv.v.v
+  SmallVector<EVT, 2> NewVTs;
+  NewVTs.push_back(N->getValueType(0));
+  SDVTList SrcVTs = Src->getVTList();
+  for (unsigned I = 1; I < SrcVTs.NumVTs; I++)
+    NewVTs.push_back(SrcVTs.VTs[I]);
+  SDNode *Result = CurDAG->getMachineNode(NewOpc, DL, NewVTs, Ops);
 
   Result->setFlags(Src->getFlags());
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll
@@ -80,10 +80,8 @@
 define <4 x i32> @insert_subvector_load_v4i32_v2i32(<4 x i32> %v1, ptr %p) {
 ; CHECK-LABEL: insert_subvector_load_v4i32_v2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, tu, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v2 = load <2 x i32>, ptr %p
   %v3 = shufflevector <2 x i32> %v2, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -95,10 +93,8 @@
 define <4 x i32> @insert_subvector_vp_load_v4i32_v2i32(<4 x i32> %v1, ptr %p, <2 x i1> %mask) {
 ; CHECK-LABEL: insert_subvector_vp_load_v4i32_v2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; CHECK-NEXT:    vle32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
   %v2 = call <2 x i32> @llvm.vp.load.v2i32(ptr %p, <2 x i1> %mask, i32 2)
   %v3 = shufflevector <2 x i32> %v2, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -111,9 +107,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vadd.vv v9, v9, v10
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
+; CHECK-NEXT:    vadd.vv v8, v9, v10
 ; CHECK-NEXT:    ret
   %v3 = add <2 x i32> %v2, <i32 0, i32 1>
   %v4 = shufflevector <2 x i32> %v3, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -125,10 +120,8 @@
 define <4 x i32> @insert_subvector_vp_add_v4i32_v2i32(<4 x i32> %v1, <2 x i32> %v2, <2 x i1> %mask) {
 ; CHECK-LABEL: insert_subvector_vp_add_v4i32_v2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v9, 1, v0.t
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; CHECK-NEXT:    vadd.vi v8, v9, 1, v0.t
 ; CHECK-NEXT:    ret
   %v3 = call <2 x i32> @llvm.vp.add.v2i32(<2 x i32> %v2, <2 x i32> <i32 1, i32 1>, <2 x i1> %mask, i32 2)
   %v4 = shufflevector <2 x i32> %v3, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -112,14 +112,12 @@
 define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) {
 ; CHECK-LABEL: insert_v4i32_v2i32_0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v9, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, tu, ma
+; CHECK-NEXT:    vle32.v v8, (a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vse32.v v9, (a0)
+; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <4 x i32>, ptr %vp
@@ -174,14 +172,12 @@
 ;
 ; LMULMAX1-LABEL: insert_v8i32_v2i32_0:
 ; LMULMAX1:       # %bb.0:
-; LMULMAX1-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX1-NEXT:    vle32.v v8, (a1)
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT:    vle32.v v9, (a0)
-; LMULMAX1-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; LMULMAX1-NEXT:    vmv.v.v v9, v8
+; LMULMAX1-NEXT:    vle32.v v8, (a0)
+; LMULMAX1-NEXT:    vsetivli zero, 2, e32, mf2, tu, ma
+; LMULMAX1-NEXT:    vle32.v v8, (a1)
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT:    vse32.v v9, (a0)
+; LMULMAX1-NEXT:    vse32.v v8, (a0)
 ; LMULMAX1-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
@@ -277,10 +273,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vle16.v v9, (a1)
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, tu, ma
+; CHECK-NEXT:    vle16.v v8, (a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -430,10 +424,8 @@
 define <vscale x 2 x i16> @insert_nxv2i16_v2i16_0(<vscale x 2 x i16> %v, ptr %svp) {
 ; CHECK-LABEL: insert_nxv2i16_v2i16_0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, tu, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %sv = load <2 x i16>, ptr %svp
   %c = call <vscale x 2 x i16> @llvm.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16> %v, <2 x i16> %sv, i64 0)