diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3483,8 +3483,13 @@ SDValue VL = N->getOperand(IsTA ? 1 : 2); assert(Src.getResNo() == 0 && "Src should be the first value of a node."); - // TODO: We should peel off layers of COPY_TO_REGCLASS so we can - // handle merges of different vec lengths + // We can fold moves of different size reg classes + while (Src->isMachineOpcode() && + Src->getMachineOpcode() == TargetOpcode::COPY_TO_REGCLASS) { + if (Src->use_empty() || !Src->use_begin()->isOnlyUserOf(Src.getNode())) + return false; + Src = Src->getOperand(0); + } // Src can only have one user, N. if (!Src.hasOneUse()) @@ -3601,7 +3606,14 @@ else if (N->getGluedNode()) Ops.push_back(N->getOperand(N->getNumOperands() - 1)); - SDNode *Result = CurDAG->getMachineNode(NewOpc, DL, Src->getVTList(), Ops); + // Since we might end up changing the register class, change the vector result + // types to be that of the vmv.v.v + SmallVector<EVT> NewVTs; + NewVTs.push_back(N->getValueType(0)); + SDVTList SrcVTs = Src->getVTList(); + for (unsigned I = 1; I < SrcVTs.NumVTs; I++) + NewVTs.push_back(SrcVTs.VTs[I]); + SDNode *Result = CurDAG->getMachineNode(NewOpc, DL, NewVTs, Ops); Result->setFlags(Src->getFlags()); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll @@ -80,10 +80,8 @@ define <4 x i32> @insert_subvector_load_v4i32_v2i32(<4 x i32> %v1, ptr %p) { ; CHECK-LABEL: insert_subvector_load_v4i32_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; CHECK-NEXT: vmv.v.v v8, 
v9 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, ma +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: ret %v2 = load <2 x i32>, ptr %p %v3 = shufflevector <2 x i32> %v2, <2 x i32> poison, <4 x i32> @@ -95,10 +93,8 @@ define <4 x i32> @insert_subvector_vp_load_v4i32_v2i32(<4 x i32> %v1, ptr %p, <2 x i1> %mask) { ; CHECK-LABEL: insert_subvector_vp_load_v4i32_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0), v0.t -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v8, (a0), v0.t ; CHECK-NEXT: ret %v2 = call <2 x i32> @llvm.vp.load.v2i32(ptr %p, <2 x i1> %mask, i32 2) %v3 = shufflevector <2 x i32> %v2, <2 x i32> poison, <4 x i32> @@ -111,9 +107,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vadd.vv v9, v9, v10 -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vadd.vv v8, v9, v10 ; CHECK-NEXT: ret %v3 = add <2 x i32> %v2, %v4 = shufflevector <2 x i32> %v3, <2 x i32> poison, <4 x i32> @@ -125,10 +120,8 @@ define <4 x i32> @insert_subvector_vp_add_v4i32_v2i32(<4 x i32> %v1, <2 x i32> %v2, <2 x i1> %mask) { ; CHECK-LABEL: insert_subvector_vp_add_v4i32_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vadd.vi v9, v9, 1, v0.t -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vadd.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %v3 = call <2 x i32> @llvm.vp.add.v2i32(<2 x i32> %v2, <2 x i32> , <2 x i1> %mask, i32 2) %v4 = shufflevector <2 x i32> %v3, <2 x i32> poison, <4 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll --- 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll @@ -112,14 +112,12 @@ define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) { ; CHECK-LABEL: insert_v4i32_v2i32_0: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; CHECK-NEXT: vmv.v.v v9, v8 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, tu, ma +; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <4 x i32>, ptr %vp @@ -174,14 +172,12 @@ ; ; LMULMAX1-LABEL: insert_v8i32_v2i32_0: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; LMULMAX1-NEXT: vle32.v v8, (a1) ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vle32.v v9, (a0) -; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; LMULMAX1-NEXT: vmv.v.v v9, v8 +; LMULMAX1-NEXT: vle32.v v8, (a0) +; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, tu, ma +; LMULMAX1-NEXT: vle32.v v8, (a1) ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vse32.v v9, (a0) +; LMULMAX1-NEXT: vse32.v v8, (a0) ; LMULMAX1-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <8 x i32>, ptr %vp @@ -277,10 +273,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, ma +; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret @@ -430,10 +424,8 @@ define <vscale x 2 x i16> @insert_nxv2i16_v2i16_0(<vscale x 2 x i16> %v, 
ptr %svp) { ; CHECK-LABEL: insert_nxv2i16_v2i16_0: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, tu, ma +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: ret %sv = load <2 x i16>, ptr %svp %c = call <vscale x 2 x i16> @llvm.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16> %v, <2 x i16> %sv, i64 0)