diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -929,6 +929,7 @@
     SDValue SubV = Node->getOperand(1);
     SDLoc DL(SubV);
     auto Idx = Node->getConstantOperandVal(2);
+    MVT XLenVT = Subtarget->getXLenVT();
     MVT SubVecVT = Node->getOperand(1).getSimpleValueType();
 
     // TODO: This method of selecting INSERT_SUBVECTOR should work
@@ -936,24 +937,6 @@
     // correctly identify the canonical register class for fixed-length types.
     // For now, keep the two paths separate.
     if (VT.isScalableVector() && SubVecVT.isScalableVector()) {
-      bool IsFullVecReg = false;
-      switch (RISCVTargetLowering::getLMUL(SubVecVT)) {
-      default:
-        break;
-      case RISCVVLMUL::LMUL_1:
-      case RISCVVLMUL::LMUL_2:
-      case RISCVVLMUL::LMUL_4:
-      case RISCVVLMUL::LMUL_8:
-        IsFullVecReg = true;
-        break;
-      }
-
-      // If the subvector doesn't occupy a full vector register then we can't
-      // insert it purely using subregister manipulation. We must not clobber
-      // the untouched elements (say, in the upper half of the VR register).
-      if (!IsFullVecReg)
-        break;
-
       const auto *TRI = Subtarget->getRegisterInfo();
       unsigned SubRegIdx;
       std::tie(SubRegIdx, Idx) =
@@ -966,9 +949,32 @@
       if (Idx != 0)
        break;
 
+      RISCVVLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
+      bool IsSubVecPartReg = SubVecLMUL == RISCVVLMUL::LMUL_F2 ||
+                             SubVecLMUL == RISCVVLMUL::LMUL_F4 ||
+                             SubVecLMUL == RISCVVLMUL::LMUL_F8;
+      (void)IsSubVecPartReg; // Silence unused variable warning without asserts.
+      assert((!IsSubVecPartReg || V.isUndef()) &&
+             "Expecting lowering to have created legal INSERT_SUBVECTORs when "
+             "the subvector is smaller than a full-sized register");
+
+      // If we haven't set a SubRegIdx, then we must be going between LMUL<=1
+      // types (VR -> VR). This can be done as a copy.
+      if (SubRegIdx == RISCV::NoSubRegister) {
+        unsigned InRegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT);
+        assert(RISCVTargetLowering::getRegClassIDForVecVT(SubVecVT) ==
+                   RISCV::VRRegClassID &&
+               InRegClassID == RISCV::VRRegClassID &&
+               "Unexpected subvector extraction");
+        SDValue RC = CurDAG->getTargetConstant(InRegClassID, DL, XLenVT);
+        SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                                 DL, VT, SubV, RC);
+        return ReplaceNode(Node, NewNode);
+      }
+
       SDNode *NewNode = CurDAG->getMachineNode(
           TargetOpcode::INSERT_SUBREG, DL, VT, V, SubV,
-          CurDAG->getTargetConstant(SubRegIdx, DL, Subtarget->getXLenVT()));
+          CurDAG->getTargetConstant(SubRegIdx, DL, XLenVT));
       return ReplaceNode(Node, NewNode);
     }
 
@@ -981,8 +987,7 @@
     unsigned RegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT);
 
-    SDValue RC =
-        CurDAG->getTargetConstant(RegClassID, DL, Subtarget->getXLenVT());
+    SDValue RC = CurDAG->getTargetConstant(RegClassID, DL, XLenVT);
     SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                              DL, VT, SubV, RC);
     ReplaceNode(Node, NewNode);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -417,6 +417,7 @@
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -465,6 +465,7 @@
       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
 
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
     }
 
@@ -501,6 +502,7 @@
      setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
 
      setOperationAction(ISD::FCOPYSIGN, VT, Legal);
 
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
     };
 
@@ -1327,6 +1329,8 @@
   case ISD::VECREDUCE_FADD:
   case ISD::VECREDUCE_SEQ_FADD:
     return lowerFPVECREDUCE(Op, DAG);
+  case ISD::INSERT_SUBVECTOR:
+    return lowerINSERT_SUBVECTOR(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:
     return lowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::BUILD_VECTOR:
@@ -2234,6 +2238,87 @@
       RISCV::RVVBitsPerBlock / VT.getVectorElementType().getSizeInBits());
 }
 
+SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDValue Vec = Op.getOperand(0);
+  SDValue SubVec = Op.getOperand(1);
+  MVT VecVT = Vec.getSimpleValueType();
+  MVT SubVecVT = SubVec.getSimpleValueType();
+
+  // TODO: Only handle scalable->scalable inserts for now, and revisit this
+  // for fixed-length vectors later.
+  if (!SubVecVT.isScalableVector() || !VecVT.isScalableVector())
+    return Op;
+
+  SDLoc DL(Op);
+  unsigned OrigIdx = Op.getConstantOperandVal(2);
+  const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+  unsigned SubRegIdx, RemIdx;
+  std::tie(SubRegIdx, RemIdx) =
+      RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+          VecVT, SubVecVT, OrigIdx, TRI);
+
+  RISCVVLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
+  bool IsSubVecPartReg = SubVecLMUL == RISCVVLMUL::LMUL_F2 ||
+                         SubVecLMUL == RISCVVLMUL::LMUL_F4 ||
+                         SubVecLMUL == RISCVVLMUL::LMUL_F8;
+
+  // If the Idx has been completely eliminated and this subvector's size is a
+  // vector register or a multiple thereof, or the surrounding elements are
+  // undef, then this is a subvector insert which naturally aligns to a vector
+  // register. These can easily be handled using subregister manipulation.
+  // If the subvector is smaller than a vector register, then the insertion
+  // must preserve the undisturbed elements of the register. We do this by
+  // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
+  // (which resolves to a subregister copy), performing a VSLIDEUP to place
+  // the subvector within the vector register, and an INSERT_SUBVECTOR of that
+  // LMUL=1 type back into the larger vector (resolving to another subregister
+  // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
+  // to avoid allocating a large register group to hold our subvector.
+  if (RemIdx == 0 && (!IsSubVecPartReg || Vec.isUndef()))
+    return Op;
+
+  // VSLIDEUP works by leaving elements 0..OFFSET-1 of the destination
+  // undisturbed.
 define <vscale x 8 x i32> @insert_nxv8i32_nxv4i32_0(<vscale x 8 x i32> %vec, <vscale x 4 x i32> %subvec) {
 ; CHECK-LABEL: insert_nxv8i32_nxv4i32_0:
@@ -181,21 +181,191 @@
   ret %v
 }
 
-; TODO: Inserts that are less than LMUL=1 are not yet supported. In this case
-; we need mask out the unaffected elements (top half of the VR %subvec
-; register)
-;define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_0(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
-;  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 0)
-;  ret <vscale x 16 x i32> %v
-;}
+define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_0(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
+; CHECK-LABEL: insert_nxv16i32_nxv1i32_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v16, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 0)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
+; CHECK-LABEL: insert_nxv16i32_nxv1i32_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli a1, a1, e32,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v16, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_6(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
+; CHECK-LABEL: insert_nxv16i32_nxv1i32_6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v11, v16, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 6)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_0(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a0, a0, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v10, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 0)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_1(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli a1, a1, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 1)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_2(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    vsetvli a0, a0, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 2)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_3(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a1, a1, a0
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    vsetvli a0, a0, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 3)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_7(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    vsetvli a1, a1, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 7)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_15(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_15:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    vsetvli a1, a1, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v9, v10, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 15)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_0(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_nxv2f16_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v16, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 0)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_2(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_nxv2f16_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli a1, a1, e16,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v16, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 2)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_26(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_nxv2f16_26:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli a1, a1, e16,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v14, v16, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 26)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_undef_nxv1f16_0(<vscale x 1 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_undef_nxv1f16_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $v8 killed $v8 def $v8m8
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv1f16.nxv32f16(<vscale x 32 x half> undef, <vscale x 1 x half> %subvec, i64 0)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_undef_nxv1f16_26(<vscale x 1 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_undef_nxv1f16_26:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    vsetvli a0, a0, e16,m1,ta,mu
+; CHECK-NEXT:    vslideup.vx v22, v8, a1
+; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv1f16.nxv32f16(<vscale x 32 x half> undef, <vscale x 1 x half> %subvec, i64 26)
+  ret <vscale x 32 x half> %v
+}
+
+declare <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8>, <vscale x 1 x i8>, i64)
 
-; TODO: Inserts that don't align to a vector register are not yet supported.
-; In this case we want to insert the subvector into the upper half of the
-; lowest VR subregister in the LMUL group.
-;define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
-;  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
-;  ret <vscale x 16 x i32> %v
-;}
+declare <vscale x 32 x half> @llvm.experimental.vector.insert.nxv1f16.nxv32f16(<vscale x 32 x half>, <vscale x 1 x half>, i64)
+declare <vscale x 32 x half> @llvm.experimental.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half>, <vscale x 2 x half>, i64)
 declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32>, <vscale x 2 x i32>, i64 %idx)
 declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv8i32(<vscale x 8 x i32>, <vscale x 4 x i32>, i64 %idx)
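
As a worked illustration of the arithmetic the new lowering comments describe (this sketch is not part of the patch, and decomposeInsert, EltsPerVReg and SubVecMinElts are made-up names rather than LLVM API), the following standalone C++ computes, in multiples of vscale, which register of the LMUL group a sub-register-sized insert touches, the vslideup offset, and the AVL used for the tail-undisturbed vsetvli:

// Standalone illustration (not LLVM code; all names are invented for the
// example): how a smaller-than-a-register subvector insert index splits into
// (a) the vector register within the LMUL group that is written, (b) the
// vslideup offset and (c) the AVL for the tail-undisturbed vsetvli.
#include <cassert>
#include <cstdio>

struct SubRegInsert {
  unsigned VRegInGroup; // which register of the LMUL register group is touched
  unsigned SlideOffset; // vslideup offset, in multiples of vscale elements
  unsigned VL;          // vsetvli AVL, in multiples of vscale elements
};

// EltsPerVReg:   minimum element count one vector register (LMUL=1) holds for
//                this element type, e.g. 8 for i8 with a 64-bit RVV block.
// SubVecMinElts: minimum element count of the subvector being inserted.
// Idx:           the INSERT_SUBVECTOR index, in minimum elements.
SubRegInsert decomposeInsert(unsigned EltsPerVReg, unsigned SubVecMinElts,
                             unsigned Idx) {
  assert(Idx % SubVecMinElts == 0 && "index must be a multiple of the "
                                     "subvector's known minimum size");
  assert(SubVecMinElts <= EltsPerVReg &&
         "subvector must fit within one vector register");
  SubRegInsert R;
  R.VRegInGroup = Idx / EltsPerVReg;    // handled as a subregister index
  R.SlideOffset = Idx % EltsPerVReg;    // remainder handled by the vslideup
  R.VL = R.SlideOffset + SubVecMinElts; // elements below the offset and past
                                        // VL stay undisturbed (tu policy)
  return R;
}

int main() {
  // Mirrors the insert_nxv16i8_nxv1i8_15 test: nxv1i8 into nxv16i8 at index 15.
  SubRegInsert R = decomposeInsert(/*EltsPerVReg=*/8, /*SubVecMinElts=*/1,
                                   /*Idx=*/15);
  std::printf("vreg #%u in group, offset %u*vscale, VL %u*vscale\n",
              R.VRegInGroup, R.SlideOffset, R.VL); // vreg #1, offset 7, VL 8
  return 0;
}

Plugging in the insert_nxv16i8_nxv1i8_15 test above, where one vector register holds vscale x 8 bytes, gives register #1 of the group, a slide offset of 7*vscale and an AVL of 8*vscale, which matches the csrr/srli/slli/sub sequence and the vslideup.vx into v9 in that test's CHECK lines.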