diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -945,68 +945,53 @@
     auto Idx = Node->getConstantOperandVal(2);
     MVT SubVecVT = SubV.getSimpleValueType();
 
-    // TODO: This method of selecting INSERT_SUBVECTOR should work
-    // with any type of insertion (fixed <-> scalable) but we don't yet
-    // correctly identify the canonical register class for fixed-length types.
-    // For now, keep the two paths separate.
-    if (VT.isScalableVector() && SubVecVT.isScalableVector()) {
-      const auto *TRI = Subtarget->getRegisterInfo();
-      unsigned SubRegIdx;
-      std::tie(SubRegIdx, Idx) =
-          RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
-              VT, SubVecVT, Idx, TRI);
-
-      // If the Idx hasn't been completely eliminated then this is a subvector
-      // insert which doesn't naturally align to a vector register. These must
-      // be handled using instructions to manipulate the vector registers.
-      if (Idx != 0)
-        break;
-
-      RISCVVLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
-      bool IsSubVecPartReg = SubVecLMUL == RISCVVLMUL::LMUL_F2 ||
-                             SubVecLMUL == RISCVVLMUL::LMUL_F4 ||
-                             SubVecLMUL == RISCVVLMUL::LMUL_F8;
-      (void)IsSubVecPartReg; // Silence unused variable warning without asserts.
-      assert((!IsSubVecPartReg || V.isUndef()) &&
-             "Expecting lowering to have created legal INSERT_SUBVECTORs when "
-             "the subvector is smaller than a full-sized register");
-
-      // If we haven't set a SubRegIdx, then we must be going between LMUL<=1
-      // types (VR -> VR). This can be done as a copy.
-      if (SubRegIdx == RISCV::NoSubRegister) {
-        unsigned InRegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT);
-        assert(RISCVTargetLowering::getRegClassIDForVecVT(SubVecVT) ==
-                   RISCV::VRRegClassID &&
-               InRegClassID == RISCV::VRRegClassID &&
-               "Unexpected subvector extraction");
-        SDValue RC = CurDAG->getTargetConstant(InRegClassID, DL, XLenVT);
-        SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
-                                                 DL, VT, SubV, RC);
-        return ReplaceNode(Node, NewNode);
-      }
+    MVT SubVecContainerVT = SubVecVT;
+    // Establish the correct scalable-vector types for any fixed-length type.
+    if (SubVecVT.isFixedLengthVector())
+      SubVecContainerVT = RISCVTargetLowering::getContainerForFixedLengthVector(
+          *CurDAG, SubVecVT, *Subtarget);
+    if (VT.isFixedLengthVector())
+      VT = RISCVTargetLowering::getContainerForFixedLengthVector(*CurDAG, VT,
+                                                                 *Subtarget);
 
-      SDNode *NewNode = CurDAG->getMachineNode(
-          TargetOpcode::INSERT_SUBREG, DL, VT, V, SubV,
-          CurDAG->getTargetConstant(SubRegIdx, DL, XLenVT));
-      return ReplaceNode(Node, NewNode);
-    }
+    const auto *TRI = Subtarget->getRegisterInfo();
+    unsigned SubRegIdx;
+    std::tie(SubRegIdx, Idx) =
+        RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+            VT, SubVecContainerVT, Idx, TRI);
 
-    if (VT.isScalableVector() && SubVecVT.isFixedLengthVector()) {
-      // Bail when not a "cast" like insert_subvector.
-      if (Idx != 0)
-        break;
-      if (!Node->getOperand(0).isUndef())
-        break;
+    // If the Idx hasn't been completely eliminated then this is a subvector
+    // insert which doesn't naturally align to a vector register. These must
+    // be handled using instructions to manipulate the vector registers.
+    if (Idx != 0)
+      break;
 
-    unsigned RegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT);
+    RISCVVLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
+    bool IsSubVecPartReg = SubVecLMUL == RISCVVLMUL::LMUL_F2 ||
+                           SubVecLMUL == RISCVVLMUL::LMUL_F4 ||
+                           SubVecLMUL == RISCVVLMUL::LMUL_F8;
+    (void)IsSubVecPartReg; // Silence unused variable warning without asserts.
+    assert((!IsSubVecPartReg || V.isUndef()) &&
+           "Expecting lowering to have created legal INSERT_SUBVECTORs when "
+           "the subvector is smaller than a full-sized register");
 
-    SDValue RC = CurDAG->getTargetConstant(RegClassID, DL, XLenVT);
+    // If we haven't set a SubRegIdx, then we must be going between
+    // equally-sized LMUL groups (e.g. VR -> VR). This can be done as a copy.
+    if (SubRegIdx == RISCV::NoSubRegister) {
+      unsigned InRegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT);
+      assert(RISCVTargetLowering::getRegClassIDForVecVT(SubVecContainerVT) ==
+                 InRegClassID &&
+             "Unexpected subvector extraction");
+      SDValue RC = CurDAG->getTargetConstant(InRegClassID, DL, XLenVT);
       SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                                DL, VT, SubV, RC);
-      ReplaceNode(Node, NewNode);
-      return;
+      return ReplaceNode(Node, NewNode);
     }
-    break;
+
+    SDNode *NewNode = CurDAG->getMachineNode(
+        TargetOpcode::INSERT_SUBREG, DL, VT, V, SubV,
+        CurDAG->getTargetConstant(SubRegIdx, DL, XLenVT));
+    return ReplaceNode(Node, NewNode);
   }
   case ISD::EXTRACT_SUBVECTOR: {
     SDValue V = Node->getOperand(0);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -531,6 +531,7 @@
       setTruncStoreAction(VT, OtherVT, Expand);
 
       // We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed.
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
 
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
@@ -602,6 +603,7 @@
       }
 
       // We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed.
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
 
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
@@ -2436,15 +2438,42 @@
   MVT VecVT = Vec.getSimpleValueType();
   MVT SubVecVT = SubVec.getSimpleValueType();
 
-  // TODO: Only handle scalable->scalable inserts for now, and revisit this for
-  // fixed-length vectors later.
-  if (!SubVecVT.isScalableVector() || !VecVT.isScalableVector())
-    return Op;
-
   SDLoc DL(Op);
+  MVT XLenVT = Subtarget.getXLenVT();
   unsigned OrigIdx = Op.getConstantOperandVal(2);
   const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();
 
+  // If the subvector is a fixed-length type, we cannot use subregister
+  // manipulation to simplify the codegen; we don't know which register of a
+  // LMUL group contains the specific subvector as we only know the minimum
+  // register size. Therefore we must slide the vector group up the full
+  // amount.
+  if (SubVecVT.isFixedLengthVector()) {
+    if (OrigIdx == 0 && Vec.isUndef())
+      return Op;
+    MVT ContainerVT = VecVT;
+    if (VecVT.isFixedLengthVector()) {
+      ContainerVT = RISCVTargetLowering::getContainerForFixedLengthVector(
+          DAG, VecVT, Subtarget);
+      Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget);
+    }
+    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
+                         DAG.getUNDEF(ContainerVT), SubVec,
+                         DAG.getConstant(0, DL, XLenVT));
+    SDValue Mask =
+        getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first;
+    // Set the vector length to only the number of elements we care about. Note
+    // that for slideup this includes the offset.
+    SDValue VL =
+        DAG.getConstant(OrigIdx + SubVecVT.getVectorNumElements(), DL, XLenVT);
+    SDValue SlideupAmt = DAG.getConstant(OrigIdx, DL, XLenVT);
+    SDValue Slideup = DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, ContainerVT, Vec,
+                                  SubVec, SlideupAmt, Mask, VL);
+    if (!VecVT.isFixedLengthVector())
+      return Slideup;
+    return convertFromScalableVector(VecVT, Slideup, DAG, Subtarget);
+  }
+
   unsigned SubRegIdx, RemIdx;
   std::tie(SubRegIdx, RemIdx) =
       RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
@@ -2455,11 +2484,11 @@
                          SubVecLMUL == RISCVVLMUL::LMUL_F4 ||
                          SubVecLMUL == RISCVVLMUL::LMUL_F8;
 
-  // If the Idx has been completely eliminated and this subvector's size is a
-  // vector register or a multiple thereof, or the surrounding elements are
+  // 1. If the Idx has been completely eliminated and this subvector's size is
+  // a vector register or a multiple thereof, or the surrounding elements are
   // undef, then this is a subvector insert which naturally aligns to a vector
   // register. These can easily be handled using subregister manipulation.
-  // If the subvector is smaller than a vector register, then the insertion
+  // 2. If the subvector is smaller than a vector register, then the insertion
   // must preserve the undisturbed elements of the register. We do this by
   // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
   // (which resolves to a subregister copy), performing a VSLIDEUP to place the
@@ -2475,7 +2504,6 @@
   // (in our case undisturbed). This means we can set up a subvector insertion
   // where OFFSET is the insertion offset, and the VL is the OFFSET plus the
   // size of the subvector.
-  MVT XLenVT = Subtarget.getXLenVT();
   MVT InterSubVT = getLMUL1VT(VecVT);
 
   // Extract a subvector equal to the nearest full vector register type.
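
A quick illustration of the fixed-into-scalable path added above, before the new test file: inserting a <2 x i32> at element index 2 keeps the destination tail undisturbed and sets VL to the offset plus the subvector length (2 + 2 = 4) for the slideup. This is a sketch only; the function name is made up, and the instructions quoted in the comments are taken from the insert_nxv8i32_v2i32_2 test added below, not re-derived here.

; Sketch; mirrors insert_nxv8i32_v2i32_2 from the new test file below.
define <vscale x 8 x i32> @example_insert_at_2(<vscale x 8 x i32> %vec, <2 x i32>* %svp) {
  %sv = load <2 x i32>, <2 x i32>* %svp
  ; Lowered via RISCVISD::VSLIDEUP_VL; per the test below this selects to roughly:
  ;   vsetivli a0, 4, e32,m4,tu,mu   ; VL = 2 (offset) + 2 (subvector elements)
  ;   vslideup.vi v8, v28, 2
  %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 2)
  ret <vscale x 8 x i32> %v
}
declare <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32>, <2 x i32>, i64)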
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -0,0 +1,331 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2
+; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1
+
+define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, <2 x i32>* %svp) {
+; CHECK-LABEL: insert_nxv8i32_v2i32_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v28, (a0)
+; CHECK-NEXT:    vsetivli a0, 2, e32,m4,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v28, 0
+; CHECK-NEXT:    ret
+  %sv = load <2 x i32>, <2 x i32>* %svp
+  %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 0)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, <2 x i32>* %svp) {
+; CHECK-LABEL: insert_nxv8i32_v2i32_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v28, (a0)
+; CHECK-NEXT:    vsetivli a0, 4, e32,m4,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v28, 2
+; CHECK-NEXT:    ret
+  %sv = load <2 x i32>, <2 x i32>* %svp
+  %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 2)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, <2 x i32>* %svp) {
+; CHECK-LABEL: insert_nxv8i32_v2i32_6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v28, (a0)
+; CHECK-NEXT:    vsetivli a0, 8, e32,m4,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v28, 6
+; CHECK-NEXT:    ret
+  %sv = load <2 x i32>, <2 x i32>* %svp
+  %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 6)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, <8 x i32>* %svp) {
+; LMULMAX2-LABEL: insert_nxv8i32_v8i32_0:
+; LMULMAX2:       # %bb.0:
+; LMULMAX2-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vle32.v v28, (a0)
+; LMULMAX2-NEXT:    vsetivli a0, 8, e32,m4,tu,mu
+; LMULMAX2-NEXT:    vslideup.vi v8, v28, 0
+; LMULMAX2-NEXT:    ret
+;
+; LMULMAX1-LABEL: insert_nxv8i32_v8i32_0:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v28, (a0)
+; LMULMAX1-NEXT:    addi a0, a0, 16
+; LMULMAX1-NEXT:    vle32.v v12, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 4, e32,m4,tu,mu
+; LMULMAX1-NEXT:    vslideup.vi v8, v28, 0
+; LMULMAX1-NEXT:    vsetivli a0, 8, e32,m4,tu,mu
+; LMULMAX1-NEXT:    vslideup.vi v8, v12, 4
+; LMULMAX1-NEXT:    ret
+  %sv = load <8 x i32>, <8 x i32>* %svp
+  %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @insert_nxv8i32_v8i32_4(<vscale x 8 x i32> %vec, <8 x i32>* %svp) {
+; LMULMAX2-LABEL: insert_nxv8i32_v8i32_4:
+; LMULMAX2:       # %bb.0:
+; LMULMAX2-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vle32.v v28, (a0)
+; LMULMAX2-NEXT:    vsetivli a0, 12, e32,m4,tu,mu
+; LMULMAX2-NEXT:    vslideup.vi v8, v28, 4
+; LMULMAX2-NEXT:    ret
+;
+; LMULMAX1-LABEL: insert_nxv8i32_v8i32_4:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v28, (a0)
+; LMULMAX1-NEXT:    addi a0, a0, 16
+; LMULMAX1-NEXT:    vle32.v v12, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 8, e32,m4,tu,mu
+; LMULMAX1-NEXT:    vslideup.vi v8, v28, 4
+; LMULMAX1-NEXT:    vsetivli a0, 12, e32,m4,tu,mu
+; LMULMAX1-NEXT:    vslideup.vi v8, v12, 8
+; LMULMAX1-NEXT:    ret
+  %sv = load <8 x i32>, <8 x i32>* %svp
+  %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 4)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, <8 x i32>* %svp) {
+; LMULMAX2-LABEL: insert_nxv8i32_v8i32_8:
+; LMULMAX2:       # %bb.0:
+; LMULMAX2-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vle32.v v28, (a0)
+; LMULMAX2-NEXT:    vsetivli a0, 16, e32,m4,tu,mu
+; LMULMAX2-NEXT:    vslideup.vi v8, v28, 8
+; LMULMAX2-NEXT:    ret
+;
+; LMULMAX1-LABEL: insert_nxv8i32_v8i32_8:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v28, (a0)
+; LMULMAX1-NEXT:    addi a0, a0, 16
+; LMULMAX1-NEXT:    vle32.v v12, (a0)
+; LMULMAX1-NEXT:    vsetivli a0, 12, e32,m4,tu,mu
+; LMULMAX1-NEXT:    vslideup.vi v8, v28, 8
+; LMULMAX1-NEXT:    vsetivli a0, 16, e32,m4,tu,mu
+; LMULMAX1-NEXT:    vslideup.vi v8, v12, 12
+; LMULMAX1-NEXT:    ret
+  %sv = load <8 x i32>, <8 x i32>* %svp
+  %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
+  ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @insert_nxv8i32_undef_v2i32_0(<2 x i32>* %svp) {
+; CHECK-LABEL: insert_nxv8i32_undef_v2i32_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    ret
+  %sv = load <2 x i32>, <2 x i32>* %svp
+  %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> undef, <2 x i32> %sv, i64 0)
+  ret <vscale x 8 x i32> %v
+}
+
+define void @insert_v4i32_v2i32_0(<4 x i32>* %vp, <2 x i32>* %svp) {
+; CHECK-LABEL: insert_v4i32_v2i32_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v25, (a1)
+; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v26, (a0)
+; CHECK-NEXT:    vsetivli a1, 2, e32,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v26, v25, 0
+; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; CHECK-NEXT:    vse32.v v26, (a0)
+; CHECK-NEXT:    ret
+  %sv = load <2 x i32>, <2 x i32>* %svp
+  %vec = load <4 x i32>, <4 x i32>* %vp
+  %v = call <4 x i32> @llvm.experimental.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 0)
+  store <4 x i32> %v, <4 x i32>* %vp
+  ret void
+}
+
+define void @insert_v4i32_v2i32_2(<4 x i32>* %vp, <2 x i32>* %svp) {
+; CHECK-LABEL: insert_v4i32_v2i32_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v25, (a1)
+; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v26, (a0)
+; CHECK-NEXT:    vsetivli a1, 4, e32,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v26, v25, 2
+; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; CHECK-NEXT:    vse32.v v26, (a0)
+; CHECK-NEXT:    ret
+  %sv = load <2 x i32>, <2 x i32>* %svp
+  %vec = load <4 x i32>, <4 x i32>* %vp
+  %v = call <4 x i32> @llvm.experimental.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 2)
+  store <4 x i32> %v, <4 x i32>* %vp
+  ret void
+}
+
+define void @insert_v4i32_undef_v2i32_0(<4 x i32>* %vp, <2 x i32>* %svp) {
+; CHECK-LABEL: insert_v4i32_undef_v2i32_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v25, (a1)
+; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vsetivli a1, 2, e32,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v26, v25, 0
+; CHECK-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; CHECK-NEXT:    vse32.v v26, (a0)
+; CHECK-NEXT:    ret
+  %sv = load <2 x i32>, <2 x i32>* %svp
+  %v = call <4 x i32> @llvm.experimental.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %sv, i64 0)
+  store <4 x i32> %v, <4 x i32>* %vp
+  ret void
+}
+
+define void @insert_v8i32_v2i32_0(<8 x i32>* %vp, <2 x i32>* %svp) {
+; LMULMAX2-LABEL: insert_v8i32_v2i32_0:
+; LMULMAX2:       # %bb.0:
+; LMULMAX2-NEXT:    vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX2-NEXT:    vle32.v v26, (a1)
+; LMULMAX2-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vle32.v v28, (a0)
+; LMULMAX2-NEXT:    vsetivli a1, 2, e32,m2,tu,mu
+; LMULMAX2-NEXT:    vslideup.vi v28, v26, 0
+; LMULMAX2-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vse32.v v28, (a0)
+; LMULMAX2-NEXT:    ret
+;
+; LMULMAX1-LABEL: insert_v8i32_v2i32_0:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v25, (a1)
+; LMULMAX1-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v26, (a0)
+; LMULMAX1-NEXT:    vsetivli a1, 2, e32,m1,tu,mu
+; LMULMAX1-NEXT:    vslideup.vi v26, v25, 0
+; LMULMAX1-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vse32.v v26, (a0)
+; LMULMAX1-NEXT:    ret
+  %sv = load <2 x i32>, <2 x i32>* %svp
+  %vec = load <8 x i32>, <8 x i32>* %vp
+  %v = call <8 x i32> @llvm.experimental.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 0)
+  store <8 x i32> %v, <8 x i32>* %vp
+  ret void
+}
+
+define void @insert_v8i32_v2i32_2(<8 x i32>* %vp, <2 x i32>* %svp) {
+; LMULMAX2-LABEL: insert_v8i32_v2i32_2:
+; LMULMAX2:       # %bb.0:
+; LMULMAX2-NEXT:    vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX2-NEXT:    vle32.v v26, (a1)
+; LMULMAX2-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vle32.v v28, (a0)
+; LMULMAX2-NEXT:    vsetivli a1, 4, e32,m2,tu,mu
+; LMULMAX2-NEXT:    vslideup.vi v28, v26, 2
+; LMULMAX2-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vse32.v v28, (a0)
+; LMULMAX2-NEXT:    ret
+;
+; LMULMAX1-LABEL: insert_v8i32_v2i32_2:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    addi sp, sp, -32
+; LMULMAX1-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-NEXT:    vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v25, (a1)
+; LMULMAX1-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v26, (a0)
+; LMULMAX1-NEXT:    vse32.v v26, (sp)
+; LMULMAX1-NEXT:    addi a1, sp, 8
+; LMULMAX1-NEXT:    vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vse32.v v25, (a1)
+; LMULMAX1-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v25, (sp)
+; LMULMAX1-NEXT:    vse32.v v25, (a0)
+; LMULMAX1-NEXT:    addi sp, sp, 32
+; LMULMAX1-NEXT:    ret
+  %sv = load <2 x i32>, <2 x i32>* %svp
+  %vec = load <8 x i32>, <8 x i32>* %vp
+  %v = call <8 x i32> @llvm.experimental.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 2)
+  store <8 x i32> %v, <8 x i32>* %vp
+  ret void
+}
+
+define void @insert_v8i32_v2i32_6(<8 x i32>* %vp, <2 x i32>* %svp) {
+; LMULMAX2-LABEL: insert_v8i32_v2i32_6:
+; LMULMAX2:       # %bb.0:
+; LMULMAX2-NEXT:    vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX2-NEXT:    vle32.v v26, (a1)
+; LMULMAX2-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vle32.v v28, (a0)
+; LMULMAX2-NEXT:    vsetivli a1, 8, e32,m2,tu,mu
+; LMULMAX2-NEXT:    vslideup.vi v28, v26, 6
+; LMULMAX2-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vse32.v v28, (a0)
+; LMULMAX2-NEXT:    ret
+;
+; LMULMAX1-LABEL: insert_v8i32_v2i32_6:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    addi sp, sp, -32
+; LMULMAX1-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-NEXT:    vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v25, (a1)
+; LMULMAX1-NEXT:    addi a0, a0, 16
+; LMULMAX1-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v26, (a0)
+; LMULMAX1-NEXT:    addi a1, sp, 16
+; LMULMAX1-NEXT:    vse32.v v26, (a1)
+; LMULMAX1-NEXT:    addi a2, sp, 24
+; LMULMAX1-NEXT:    vsetivli a3, 2, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vse32.v v25, (a2)
+; LMULMAX1-NEXT:    vsetivli a2, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v25, (a1)
+; LMULMAX1-NEXT:    vse32.v v25, (a0)
+; LMULMAX1-NEXT:    addi sp, sp, 32
+; LMULMAX1-NEXT:    ret
+  %sv = load <2 x i32>, <2 x i32>* %svp
+  %vec = load <8 x i32>, <8 x i32>* %vp
+  %v = call <8 x i32> @llvm.experimental.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 6)
+  store <8 x i32> %v, <8 x i32>* %vp
+  ret void
+}
+
+define void @insert_v8i32_undef_v2i32_6(<8 x i32>* %vp, <2 x i32>* %svp) {
+; LMULMAX2-LABEL: insert_v8i32_undef_v2i32_6:
+; LMULMAX2:       # %bb.0:
+; LMULMAX2-NEXT:    vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX2-NEXT:    vle32.v v26, (a1)
+; LMULMAX2-NEXT:    vsetivli a1, 8, e32,m2,ta,mu
+; LMULMAX2-NEXT:    vslideup.vi v28, v26, 6
+; LMULMAX2-NEXT:    vse32.v v28, (a0)
+; LMULMAX2-NEXT:    ret
+;
+; LMULMAX1-LABEL: insert_v8i32_undef_v2i32_6:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    addi sp, sp, -32
+; LMULMAX1-NEXT:    .cfi_def_cfa_offset 32
+; LMULMAX1-NEXT:    vsetivli a2, 2, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v25, (a1)
+; LMULMAX1-NEXT:    addi a1, sp, 24
+; LMULMAX1-NEXT:    vse32.v v25, (a1)
+; LMULMAX1-NEXT:    vsetivli a1, 4, e32,m1,ta,mu
+; LMULMAX1-NEXT:    vle32.v v25, (sp)
+; LMULMAX1-NEXT:    addi a1, sp, 16
+; LMULMAX1-NEXT:    vle32.v v26, (a1)
+; LMULMAX1-NEXT:    vse32.v v25, (a0)
+; LMULMAX1-NEXT:    addi a0, a0, 16
+; LMULMAX1-NEXT:    vse32.v v26, (a0)
+; LMULMAX1-NEXT:    addi sp, sp, 32
+; LMULMAX1-NEXT:    ret
+  %sv = load <2 x i32>, <2 x i32>* %svp
+  %v = call <8 x i32> @llvm.experimental.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6)
+  store <8 x i32> %v, <8 x i32>* %vp
+  ret void
+}
+
+declare <4 x i32> @llvm.experimental.vector.insert.v2i32.v4i32(<4 x i32>, <2 x i32>, i64)
+declare <8 x i32> @llvm.experimental.vector.insert.v2i32.v8i32(<8 x i32>, <2 x i32>, i64)
+
+declare <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32>, <2 x i32>, i64)
+declare <vscale x 8 x i32> @llvm.experimental.vector.insert.v4i32.nxv8i32(<vscale x 8 x i32>, <4 x i32>, i64)
+declare <vscale x 8 x i32> @llvm.experimental.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32>, <8 x i32>, i64)
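
One usage note, hedged: when the destination is undef and the index is 0, the new lowering returns the INSERT_SUBVECTOR unchanged (the fixed-into-scalable "cast"), so no vslideup is emitted; insert_nxv8i32_undef_v2i32_0 above compiles to just the vle32.v load. A minimal sketch, with an illustrative function name, reusing the intrinsic declaration above:

; Sketch mirroring insert_nxv8i32_undef_v2i32_0 above: no slideup is needed.
define <vscale x 8 x i32> @cast_v2i32_to_nxv8i32(<2 x i32>* %svp) {
  %sv = load <2 x i32>, <2 x i32>* %svp
  ; Expected to select to a plain vle32.v of the two elements (per the test above).
  %v = call <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> undef, <2 x i32> %sv, i64 0)
  ret <vscale x 8 x i32> %v
}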