diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -655,6 +655,7 @@
     SDValue SubV = Node->getOperand(1);
     SDLoc DL(SubV);
     auto Idx = Node->getConstantOperandVal(2);
+    MVT XLenVT = Subtarget->getXLenVT();
     MVT SubVecVT = Node->getOperand(1).getSimpleValueType();
 
     // TODO: This method of selecting INSERT_SUBVECTOR should work
@@ -662,23 +663,10 @@
     // correctly identify the canonical register class for fixed-length types.
     // For now, keep the two paths separate.
     if (VT.isScalableVector() && SubVecVT.isScalableVector()) {
-      bool IsFullVecReg = false;
-      switch (RISCVTargetLowering::getLMUL(SubVecVT)) {
-      default:
-        break;
-      case RISCVVLMUL::LMUL_1:
-      case RISCVVLMUL::LMUL_2:
-      case RISCVVLMUL::LMUL_4:
-      case RISCVVLMUL::LMUL_8:
-        IsFullVecReg = true;
-        break;
-      }
-
-      // If the subvector doesn't occupy a full vector register then we can't
-      // insert it purely using subregister manipulation. We must not clobber
-      // the untouched elements (say, in the upper half of the VR register).
-      if (!IsFullVecReg)
-        break;
+      RISCVVLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
+      bool IsSubVecPartReg = SubVecLMUL == RISCVVLMUL::LMUL_F2 ||
+                             SubVecLMUL == RISCVVLMUL::LMUL_F4 ||
+                             SubVecLMUL == RISCVVLMUL::LMUL_F8;
 
       const auto *TRI = Subtarget->getRegisterInfo();
       unsigned SubRegIdx;
@@ -692,9 +680,42 @@
       if (Idx != 0)
         break;
 
+      if (IsSubVecPartReg) {
+        // If the subvector doesn't occupy a full vector register then we can't
+        // insert it purely using subregister manipulation. We must not clobber
+        // the untouched elements (say, in the upper half of the VR register).
+        // Accomplish this using a vslideup at element 0 which uses tail
+        // undisturbed and thus preserves the existing vector elements.
+        assert(Idx == 0 &&
+               RISCVTargetLowering::getLMUL(VT) == RISCVVLMUL::LMUL_1 &&
+               "Expecting lowering to have created legal INSERT_SUBVECTORs");
+        unsigned VSlideupOpc = 0;
+        switch (SubVecLMUL) {
+        default:
+          llvm_unreachable("Impossible LMUL");
+        case RISCVVLMUL::LMUL_F2:
+          VSlideupOpc = RISCV::PseudoVSLIDEUP_VI_MF2;
+          break;
+        case RISCVVLMUL::LMUL_F4:
+          VSlideupOpc = RISCV::PseudoVSLIDEUP_VI_MF4;
+          break;
+        case RISCVVLMUL::LMUL_F8:
+          VSlideupOpc = RISCV::PseudoVSLIDEUP_VI_MF8;
+          break;
+        }
+        SDValue VLMax = CurDAG->getRegister(RISCV::X0, XLenVT);
+        SDValue SEW =
+            CurDAG->getTargetConstant(VT.getScalarSizeInBits(), DL, XLenVT);
+        SDNode *NewNode = CurDAG->getMachineNode(
+            VSlideupOpc, DL, VT,
+            {V, SubV, CurDAG->getTargetConstant(0, DL, XLenVT), VLMax, SEW});
+        return ReplaceNode(Node, NewNode);
+      }
+
+      assert(SubRegIdx != RISCV::NoSubRegister && "Invalid insert");
       SDNode *NewNode = CurDAG->getMachineNode(
           TargetOpcode::INSERT_SUBREG, DL, VT, V, SubV,
-          CurDAG->getTargetConstant(SubRegIdx, DL, Subtarget->getXLenVT()));
+          CurDAG->getTargetConstant(SubRegIdx, DL, XLenVT));
       return ReplaceNode(Node, NewNode);
     }
 
@@ -707,8 +728,7 @@
     unsigned RegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT);
 
-    SDValue RC =
-        CurDAG->getTargetConstant(RegClassID, DL, Subtarget->getXLenVT());
+    SDValue RC = CurDAG->getTargetConstant(RegClassID, DL, XLenVT);
 
     SDNode *NewNode =
         CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, SubV, RC);
     ReplaceNode(Node, NewNode);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -417,6 +417,7 @@
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -465,6 +465,7 @@
       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
 
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
     }
 
@@ -1303,6 +1304,8 @@
   case ISD::VECREDUCE_FADD:
   case ISD::VECREDUCE_SEQ_FADD:
     return lowerFPVECREDUCE(Op, DAG);
+  case ISD::INSERT_SUBVECTOR:
+    return lowerINSERT_SUBVECTOR(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:
     return lowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::BUILD_VECTOR:
@@ -2211,6 +2214,87 @@
                         64 / VT.getVectorElementType().getSizeInBits());
 }
 
+SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDValue Vec = Op.getOperand(0);
+  SDValue SubVec = Op.getOperand(1);
+  MVT VecVT = Vec.getSimpleValueType();
+  MVT SubVecVT = SubVec.getSimpleValueType();
+
+  // TODO: Only handle scalable->scalable inserts for now, and revisit this for
+  // fixed-length vectors later.
+  if (!SubVecVT.isScalableVector() || !VecVT.isScalableVector())
+    return Op;
+
+  SDLoc DL(Op);
+  auto OrigIdx = Op.getConstantOperandVal(2);
+  const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+  unsigned SubRegIdx, RemIdx;
+  std::tie(SubRegIdx, RemIdx) =
+      RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+          VecVT, SubVecVT, OrigIdx, TRI);
+
+  RISCVVLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
+  bool IsSubVecPartReg = SubVecLMUL == RISCVVLMUL::LMUL_F2 ||
+                         SubVecLMUL == RISCVVLMUL::LMUL_F4 ||
+                         SubVecLMUL == RISCVVLMUL::LMUL_F8;
+
+  // If the Idx has been completely eliminated and this subvector's size is a
+  // multiple of a vector register, then this is a subvector insert which
+  // naturally aligns to a vector register. These can easily be handled using
+  // subregister manipulation.
+  // If the subvector is smaller than a vector register, then the insertion
+  // must preserve the undisturbed elements of the register. We do this by
+  // lowering to an INSERT_SUBVECTOR into a single (LMUL=1) vector register.
+  MVT InterSubVT = getLMUL1VT(VecVT);
+  if (RemIdx == 0 && (!IsSubVecPartReg || VecVT == InterSubVT))
+    return Op;
+
+  // Otherwise we must shift within the vector register to insert the
+  // subvector. Do this using a VSLIDEDOWN followed by a VSLIDEUP.
+  MVT XLenVT = Subtarget.getXLenVT();
+
+  // Extract a subvector equal to the nearest full vector register type. This
+  // should resolve to an EXTRACT_SUBREG instruction.
+  unsigned AlignedIdx = OrigIdx - RemIdx;
+  SDValue AlignedExtract =
+      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InterSubVT, Vec,
+                  DAG.getConstant(AlignedIdx, DL, XLenVT));
+
+  // Slide this vector register down by the desired number of elements in order
+  // to place the desired subvector starting at element 0.
+  SDValue SlidedownAmt = DAG.getConstant(RemIdx, DL, XLenVT);
+  // For scalable vectors this must be further multiplied by vscale.
+  SlidedownAmt = DAG.getNode(ISD::VSCALE, DL, XLenVT, SlidedownAmt);
+
+  SDValue Mask, VL;
+  std::tie(Mask, VL) = getDefaultScalableVLOps(InterSubVT, DL, DAG, Subtarget);
+  SDValue Slidedown = AlignedExtract;
+  if (RemIdx != 0)
+    Slidedown = DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, InterSubVT,
+                            DAG.getUNDEF(InterSubVT), AlignedExtract,
+                            SlidedownAmt, Mask, VL);
+
+  // The actual insertion into the vector register. This is either a COPY for a
+  // full-sized type, or a "no-op" VSLIDEUP which preserves the untouched upper
+  // elements.
+  SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InterSubVT, Slidedown,
+                               SubVec, DAG.getConstant(0, DL, XLenVT));
+
+  // Slide the LMUL=1 subvector back up into place so that the inserted
+  // subvector sits at the correct element offset.
+  SDValue Slideup = Insert;
+  if (RemIdx != 0)
+    Slideup = DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, InterSubVT, AlignedExtract,
+                          Insert, SlidedownAmt, Mask, VL);
+
+  // Insert this subvector into the correct vector register. This should
+  // resolve to an INSERT_SUBREG instruction.
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, Vec, Slideup,
+                     DAG.getConstant(AlignedIdx, DL, XLenVT));
+}
+
 SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
   SDValue Vec = Op.getOperand(0);
diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple riscv64 -mattr=+m,+experimental-v -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 8 x i32> @insert_nxv8i32_nxv4i32_0(<vscale x 8 x i32> %vec, <vscale x 4 x i32> %subvec) {
 ; CHECK-LABEL: insert_nxv8i32_nxv4i32_0:
@@ -181,21 +181,121 @@
   ret <vscale x 8 x i32> %v
 }
 
-; TODO: Inserts that are less than LMUL=1 are not yet supported. In this case
-; we need mask out the unaffected elements (top half of the VR %subvec
-; register)
-;define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_0(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
-;  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 0)
-;  ret <vscale x 16 x i32> %v
-;}
+define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_0(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
+; CHECK-LABEL: insert_nxv16i32_nxv1i32_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32,mf2,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v16, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 0)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
+; CHECK-LABEL: insert_nxv16i32_nxv1i32_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e32,m1,ta,mu
+; CHECK-NEXT:    vslidedown.vx v25, v8, a0
+; CHECK-NEXT:    vsetvli a1, zero, e32,mf2,tu,mu
+; CHECK-NEXT:    vslideup.vi v25, v16, 0
+; CHECK-NEXT:    vsetvli a1, zero, e32,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v25, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_6(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
+; CHECK-LABEL: insert_nxv16i32_nxv1i32_6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e32,mf2,tu,mu
+; CHECK-NEXT:    vslideup.vi v11, v16, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 6)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_0(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8,mf8,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v10, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 0)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_1(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e8,m1,ta,mu
+; CHECK-NEXT:    vslidedown.vx v25, v8, a0
+; CHECK-NEXT:    vsetvli a1, zero, e8,mf8,tu,mu
+; CHECK-NEXT:    vslideup.vi v25, v10, 0
+; CHECK-NEXT:    vsetvli a1, zero, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v25, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 1)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_2(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vsetvli a1, zero, e8,m1,ta,mu
+; CHECK-NEXT:    vslidedown.vx v25, v8, a0
+; CHECK-NEXT:    vsetvli a1, zero, e8,mf8,tu,mu
+; CHECK-NEXT:    vslideup.vi v25, v10, 0
+; CHECK-NEXT:    vsetvli a1, zero, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v25, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 2)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_3(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    vsetvli a1, zero, e8,m1,ta,mu
+; CHECK-NEXT:    vslidedown.vx v25, v8, a0
+; CHECK-NEXT:    vsetvli a1, zero, e8,mf8,tu,mu
+; CHECK-NEXT:    vslideup.vi v25, v10, 0
+; CHECK-NEXT:    vsetvli a1, zero, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v25, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 3)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_7(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    vsetvli a1, zero, e8,m1,ta,mu
+; CHECK-NEXT:    vslidedown.vx v25, v8, a0
+; CHECK-NEXT:    vsetvli a1, zero, e8,mf8,tu,mu
+; CHECK-NEXT:    vslideup.vi v25, v10, 0
+; CHECK-NEXT:    vsetvli a1, zero, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v25, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 7)
+  ret <vscale x 16 x i8> %v
+}
 
-; TODO: Inserts that don't align to a vector register are not yet supported.
-; In this case we want to insert the subvector into the upper half of the
-; lowest VR subregister in the LMUL group.
-;define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
-;  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
-;  ret <vscale x 16 x i32> %v
-;}
+declare <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8>, <vscale x 1 x i8>, i64)
 
 declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32>, <vscale x 2 x i32>, i64 %idx)
 declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv8i32(<vscale x 8 x i32>, <vscale x 4 x i32>, i64 %idx)