diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -419,6 +419,7 @@
       // Mask VTs are custom-expanded into a series of standard nodes
       setOperationAction(ISD::TRUNCATE, VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
     }
@@ -2443,6 +2444,43 @@
   unsigned OrigIdx = Op.getConstantOperandVal(2);
   const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();

+  // We don't have the ability to slide mask vectors up indexed by their i1
+  // elements; the smallest we can do is i8. Often we are able to bitcast to
+  // equivalent i8 vectors. Note that when inserting a fixed-length vector
+  // into a scalable one, we might not necessarily have enough scalable
+  // elements to safely divide by 8: nxv1i1 = insert nxv1i1, v4i1 is valid.
+  if (SubVecVT.getVectorElementType() == MVT::i1 &&
+      (OrigIdx != 0 || !Vec.isUndef())) {
+    if (VecVT.getVectorMinNumElements() >= 8 &&
+        SubVecVT.getVectorMinNumElements() >= 8) {
+      assert(OrigIdx % 8 == 0 && "Invalid index");
+      assert(VecVT.getVectorMinNumElements() % 8 == 0 &&
+             SubVecVT.getVectorMinNumElements() % 8 == 0 &&
+             "Unexpected mask vector lowering");
+      OrigIdx /= 8;
+      SubVecVT =
+          MVT::getVectorVT(MVT::i8, SubVecVT.getVectorMinNumElements() / 8,
+                           SubVecVT.isScalableVector());
+      VecVT = MVT::getVectorVT(MVT::i8, VecVT.getVectorMinNumElements() / 8,
+                               VecVT.isScalableVector());
+      Vec = DAG.getBitcast(VecVT, Vec);
+      SubVec = DAG.getBitcast(SubVecVT, SubVec);
+    } else {
+      // We can't slide this mask vector up indexed by its i1 elements.
+      // This poses a problem when we wish to insert a scalable vector which
+      // can't be re-expressed as a larger type. Just choose the slow path and
+      // extend to a larger type, then truncate back down.
+      MVT ExtVecVT = VecVT.changeVectorElementType(MVT::i8);
+      MVT ExtSubVecVT = SubVecVT.changeVectorElementType(MVT::i8);
+      Vec = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVecVT, Vec);
+      SubVec = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtSubVecVT, SubVec);
+      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ExtVecVT, Vec, SubVec,
+                        Op.getOperand(2));
+      SDValue SplatZero = DAG.getConstant(0, DL, ExtVecVT);
+      return DAG.getSetCC(DL, VecVT, Vec, SplatZero, ISD::SETNE);
+    }
+  }
+
   // If the subvector vector is a fixed-length type, we cannot use subregister
   // manipulation to simplify the codegen; we don't know which register of a
   // LMUL group contains the specific subvector as we only know the minimum
@@ -2539,6 +2577,12 @@
   if (VecVT.bitsGT(InterSubVT))
     Slideup = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, Vec, Slideup,
                           DAG.getConstant(AlignedIdx, DL, XLenVT));
+
+  // We might have bitcast from a mask type: cast back to the original type if
+  // required.
+  if (VecVT != Op.getSimpleValueType())
+    Slideup = DAG.getBitcast(Op.getSimpleValueType(), Slideup);
+
   return Slideup;
 }

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -361,6 +361,100 @@
   ret void
 }

+define void @insert_v32i1_v8i1_0(<32 x i1>* %vp, <8 x i1>* %svp) {
+; LMULMAX2-LABEL: insert_v32i1_v8i1_0:
+; LMULMAX2:       # %bb.0:
+; LMULMAX2-NEXT:    addi a2, zero, 32
+; LMULMAX2-NEXT:    vsetvli a3, a2, e8,m2,ta,mu
+; LMULMAX2-NEXT:    vle1.v v25, (a0)
+; LMULMAX2-NEXT:    vsetivli a3, 8, e8,m1,ta,mu
+; LMULMAX2-NEXT:    vle1.v v26, (a1)
+; LMULMAX2-NEXT:    vsetivli a1, 1, e8,m1,tu,mu
+; LMULMAX2-NEXT:    vslideup.vi v25, v26, 0
+; LMULMAX2-NEXT:    vsetvli a1, a2, e8,m2,ta,mu
+; LMULMAX2-NEXT:    vse1.v v25, (a0)
+; LMULMAX2-NEXT:    ret
+;
+; LMULMAX1-LABEL: insert_v32i1_v8i1_0:
+; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    vsetivli a2, 16, e8,m1,ta,mu
+; LMULMAX1-NEXT:    vle1.v v25, (a0)
+; LMULMAX1-NEXT:    vsetivli a2, 8, e8,m1,ta,mu
+; LMULMAX1-NEXT:    vle1.v v26, (a1)
+; LMULMAX1-NEXT:    vsetivli a1, 1, e8,m1,tu,mu
+; LMULMAX1-NEXT:    vslideup.vi v25, v26, 0
+; LMULMAX1-NEXT:    vsetivli a1, 16, e8,m1,ta,mu
+; LMULMAX1-NEXT:    vse1.v v25, (a0)
+; LMULMAX1-NEXT:    ret
+  %v = load <32 x i1>, <32 x i1>* %vp
+  %sv = load <8 x i1>, <8 x i1>* %svp
+  %c = call <32 x i1> @llvm.experimental.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 0)
+  store <32 x i1> %c, <32 x i1>* %vp
+  ret void
+}
+
+; FIXME: SplitVecRes_INSERT_SUBVECTOR crashes on this one when trying to spill
+; to the stack.
+;define void @insert_v32i1_v8i1_16(<32 x i1>* %vp, <8 x i1>* %svp) {
+;  %v = load <32 x i1>, <32 x i1>* %vp
+;  %sv = load <8 x i1>, <8 x i1>* %svp
+;  %c = call <32 x i1> @llvm.experimental.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 16)
+;  store <32 x i1> %c, <32 x i1>* %vp
+;  ret void
+;}
+
+define void @insert_v8i1_v4i1_0(<8 x i1>* %vp, <4 x i1>* %svp) {
+; CHECK-LABEL: insert_v8i1_v4i1_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 8, e8,m1,ta,mu
+; CHECK-NEXT:    vle1.v v27, (a0)
+; CHECK-NEXT:    vsetivli a2, 4, e8,m1,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a1)
+; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vmerge.vim v25, v25, 1, v0
+; CHECK-NEXT:    vsetivli a1, 8, e8,m1,ta,mu
+; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vmv1r.v v0, v27
+; CHECK-NEXT:    vmerge.vim v26, v26, 1, v0
+; CHECK-NEXT:    vsetivli a1, 4, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v26, v25, 0
+; CHECK-NEXT:    vsetivli a1, 8, e8,m1,ta,mu
+; CHECK-NEXT:    vmsne.vi v25, v26, 0
+; CHECK-NEXT:    vse1.v v25, (a0)
+; CHECK-NEXT:    ret
+  %v = load <8 x i1>, <8 x i1>* %vp
+  %sv = load <4 x i1>, <4 x i1>* %svp
+  %c = call <8 x i1> @llvm.experimental.vector.insert.v4i1.v8i1(<8 x i1> %v, <4 x i1> %sv, i64 0)
+  store <8 x i1> %c, <8 x i1>* %vp
+  ret void
+}
+
+define void @insert_v8i1_v4i1_4(<8 x i1>* %vp, <4 x i1>* %svp) {
+; CHECK-LABEL: insert_v8i1_v4i1_4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a2, 8, e8,m1,ta,mu
+; CHECK-NEXT:    vle1.v v27, (a0)
+; CHECK-NEXT:    vsetivli a2, 4, e8,m1,ta,mu
+; CHECK-NEXT:    vle1.v v0, (a1)
+; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vmerge.vim v25, v25, 1, v0
+; CHECK-NEXT:    vsetivli a1, 8, e8,m1,ta,mu
+; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vmv1r.v v0, v27
+; CHECK-NEXT:    vmerge.vim v26, v26, 1, v0
+; CHECK-NEXT:    vsetivli a1, 8, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v26, v25, 4
+; CHECK-NEXT:    vsetivli a1, 8, e8,m1,ta,mu
+; CHECK-NEXT:    vmsne.vi v25, v26, 0
+; CHECK-NEXT:    vse1.v v25, (a0)
+; CHECK-NEXT:    ret
+  %v = load <8 x i1>, <8 x i1>* %vp
+  %sv = load <4 x i1>, <4 x i1>* %svp
+  %c = call <8 x i1> @llvm.experimental.vector.insert.v4i1.v8i1(<8 x i1> %v, <4 x i1> %sv, i64 4)
+  store <8 x i1> %c, <8 x i1>* %vp
+  ret void
+}
+
 define <vscale x 2 x i16> @insert_nxv2i16_v2i16_0(<vscale x 2 x i16> %v, <2 x i16>* %svp) {
 ; CHECK-LABEL: insert_nxv2i16_v2i16_0:
 ; CHECK:       # %bb.0:
@@ -387,11 +481,87 @@
   ret <vscale x 2 x i16> %c
 }

+define <vscale x 2 x i1> @insert_nxv2i1_v4i1_0(<vscale x 2 x i1> %v, <4 x i1>* %svp) {
+; CHECK-LABEL: insert_nxv2i1_v4i1_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 4, e8,m1,ta,mu
+; CHECK-NEXT:    vle1.v v27, (a0)
+; CHECK-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
+; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vmerge.vim v25, v25, 1, v0
+; CHECK-NEXT:    vsetivli a0, 4, e8,m1,ta,mu
+; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vmv1r.v v0, v27
+; CHECK-NEXT:    vmerge.vim v26, v26, 1, v0
+; CHECK-NEXT:    vsetivli a0, 4, e8,mf4,tu,mu
+; CHECK-NEXT:    vslideup.vi v25, v26, 0
+; CHECK-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
+; CHECK-NEXT:    vmsne.vi v0, v25, 0
+; CHECK-NEXT:    ret
+  %sv = load <4 x i1>, <4 x i1>* %svp
+  %c = call <vscale x 2 x i1> @llvm.experimental.vector.insert.v4i1.nxv2i1(<vscale x 2 x i1> %v, <4 x i1> %sv, i64 0)
+  ret <vscale x 2 x i1> %c
+}
+
+define <vscale x 2 x i1> @insert_nxv2i1_v4i1_6(<vscale x 2 x i1> %v, <4 x i1>* %svp) {
+; CHECK-LABEL: insert_nxv2i1_v4i1_6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 4, e8,m1,ta,mu
+; CHECK-NEXT:    vle1.v v27, (a0)
+; CHECK-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
+; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vmerge.vim v25, v25, 1, v0
+; CHECK-NEXT:    vsetivli a0, 4, e8,m1,ta,mu
+; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vmv1r.v v0, v27
+; CHECK-NEXT:    vmerge.vim v26, v26, 1, v0
+; CHECK-NEXT:    vsetivli a0, 10, e8,mf4,tu,mu
+; CHECK-NEXT:    vslideup.vi v25, v26, 6
+; CHECK-NEXT:    vsetvli a0, zero, e8,mf4,ta,mu
+; CHECK-NEXT:    vmsne.vi v0, v25, 0
+; CHECK-NEXT:    ret
+  %sv = load <4 x i1>, <4 x i1>* %svp
+  %c = call <vscale x 2 x i1> @llvm.experimental.vector.insert.v4i1.nxv2i1(<vscale x 2 x i1> %v, <4 x i1> %sv, i64 6)
+  ret <vscale x 2 x i1> %c
+}
+
+define <vscale x 8 x i1> @insert_nxv8i1_v4i1_0(<vscale x 8 x i1> %v, <8 x i1>* %svp) {
+; CHECK-LABEL: insert_nxv8i1_v4i1_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 8, e8,m1,ta,mu
+; CHECK-NEXT:    vle1.v v25, (a0)
+; CHECK-NEXT:    vsetivli a0, 1, e8,mf8,tu,mu
+; CHECK-NEXT:    vslideup.vi v0, v25, 0
+; CHECK-NEXT:    ret
+  %sv = load <8 x i1>, <8 x i1>* %svp
+  %c = call <vscale x 8 x i1> @llvm.experimental.vector.insert.v8i1.nxv8i1(<vscale x 8 x i1> %v, <8 x i1> %sv, i64 0)
+  ret <vscale x 8 x i1> %c
+}
+
+define <vscale x 8 x i1> @insert_nxv8i1_v8i1_16(<vscale x 8 x i1> %v, <8 x i1>* %svp) {
+; CHECK-LABEL: insert_nxv8i1_v8i1_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli a1, 8, e8,m1,ta,mu
+; CHECK-NEXT:    vle1.v v25, (a0)
+; CHECK-NEXT:    vsetivli a0, 3, e8,mf8,tu,mu
+; CHECK-NEXT:    vslideup.vi v0, v25, 2
+; CHECK-NEXT:    ret
+  %sv = load <8 x i1>, <8 x i1>* %svp
+  %c = call <vscale x 8 x i1> @llvm.experimental.vector.insert.v8i1.nxv8i1(<vscale x 8 x i1> %v, <8 x i1> %sv, i64 16)
+  ret <vscale x 8 x i1> %c
+}
+
+declare <8 x i1> @llvm.experimental.vector.insert.v4i1.v8i1(<8 x i1>, <4 x i1>, i64)
+declare <32 x i1> @llvm.experimental.vector.insert.v8i1.v32i1(<32 x i1>, <8 x i1>, i64)
+
 declare <4 x i16> @llvm.experimental.vector.insert.v2i16.v4i16(<4 x i16>, <2 x i16>, i64)
 declare <4 x i32> @llvm.experimental.vector.insert.v2i32.v4i32(<4 x i32>, <2 x i32>, i64)
 declare <8 x i32> @llvm.experimental.vector.insert.v2i32.v8i32(<8 x i32>, <2 x i32>, i64)

+declare <vscale x 2 x i1> @llvm.experimental.vector.insert.v4i1.nxv2i1(<vscale x 2 x i1>, <4 x i1>, i64)
+declare <vscale x 8 x i1> @llvm.experimental.vector.insert.v8i1.nxv8i1(<vscale x 8 x i1>, <8 x i1>, i64)
+
 declare <vscale x 2 x i16> @llvm.experimental.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16>, <2 x i16>, i64)
 declare <vscale x 8 x i32> @llvm.experimental.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32>, <2 x i32>, i64)
diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -389,6 +389,78 @@
   ret %v
 }

+define <vscale x 32 x i1> @insert_nxv32i1_nxv8i1_0(<vscale x 32 x i1> %v, <vscale x 8 x i1> %sv) {
+; CHECK-LABEL: insert_nxv32i1_nxv8i1_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a0, a0, e8,mf2,tu,mu
+; CHECK-NEXT:    vslideup.vi v0, v8, 0
+; CHECK-NEXT:    ret
+  %vec = call <vscale x 32 x i1> @llvm.experimental.vector.insert.nxv8i1.nxv32i1(<vscale x 32 x i1> %v, <vscale x 8 x i1> %sv, i64 0)
+  ret <vscale x 32 x i1> %vec
+}
+
+define <vscale x 32 x i1> @insert_nxv32i1_nxv8i1_8(<vscale x 32 x i1> %v, <vscale x 8 x i1> %sv) {
+; CHECK-LABEL: insert_nxv32i1_nxv8i1_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli a1, a1, e8,mf2,tu,mu
+; CHECK-NEXT:    vslideup.vx v0, v8, a0
+; CHECK-NEXT:    ret
+  %vec = call <vscale x 32 x i1> @llvm.experimental.vector.insert.nxv8i1.nxv32i1(<vscale x 32 x i1> %v, <vscale x 8 x i1> %sv, i64 8)
+  ret <vscale x 32 x i1> %vec
+}
+
+define <vscale x 4 x i1> @insert_nxv4i1_nxv1i1_0(<vscale x 4 x i1> %v, <vscale x 1 x i1> %sv) {
+; CHECK-LABEL: insert_nxv4i1_nxv1i1_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8,mf2,ta,mu
+; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vmerge.vim v25, v25, 1, v0
+; CHECK-NEXT:    vsetvli a0, zero, e8,mf8,ta,mu
+; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v26, v26, 1, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a0, a0, e8,mf2,tu,mu
+; CHECK-NEXT:    vslideup.vi v25, v26, 0
+; CHECK-NEXT:    vsetvli a0, zero, e8,mf2,ta,mu
+; CHECK-NEXT:    vmsne.vi v0, v25, 0
+; CHECK-NEXT:    ret
+  %vec = call <vscale x 4 x i1> @llvm.experimental.vector.insert.nxv1i1.nxv4i1(<vscale x 4 x i1> %v, <vscale x 1 x i1> %sv, i64 0)
+  ret <vscale x 4 x i1> %vec
+}
+
+define <vscale x 4 x i1> @insert_nxv4i1_nxv1i1_2(<vscale x 4 x i1> %v, <vscale x 1 x i1> %sv) {
+; CHECK-LABEL: insert_nxv4i1_nxv1i1_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli a0, zero, e8,mf2,ta,mu
+; CHECK-NEXT:    vmv.v.i v25, 0
+; CHECK-NEXT:    vmerge.vim v25, v25, 1, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    vsetvli a2, zero, e8,mf8,ta,mu
+; CHECK-NEXT:    vmv.v.i v26, 0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v26, v26, 1, v0
+; CHECK-NEXT:    vsetvli a0, a0, e8,mf2,tu,mu
+; CHECK-NEXT:    vslideup.vx v25, v26, a1
+; CHECK-NEXT:    vsetvli a0, zero, e8,mf2,ta,mu
+; CHECK-NEXT:    vmsne.vi v0, v25, 0
+; CHECK-NEXT:    ret
+  %vec = call <vscale x 4 x i1> @llvm.experimental.vector.insert.nxv1i1.nxv4i1(<vscale x 4 x i1> %v, <vscale x 1 x i1> %sv, i64 2)
+  ret <vscale x 4 x i1> %vec
+}
+
+declare <vscale x 4 x i1> @llvm.experimental.vector.insert.nxv1i1.nxv4i1(<vscale x 4 x i1>, <vscale x 1 x i1>, i64)
+declare <vscale x 32 x i1> @llvm.experimental.vector.insert.nxv8i1.nxv32i1(<vscale x 32 x i1>, <vscale x 8 x i1>, i64)
+
 declare <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8>, <vscale x 1 x i8>, i64)
 declare <vscale x 32 x half> @llvm.experimental.vector.insert.nxv1f16.nxv32f16(<vscale x 32 x half>, <vscale x 1 x half>, i64)
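
Usage note (not part of the patch): the new lowering can also be exercised standalone with llc. The snippet below is a minimal sketch; the file name mask-insert.ll and the function name @example are made up for illustration, and -mattr=+experimental-v assumes the attribute spelling used by these tests (newer trees spell it +v).

; Run with: llc -mtriple=riscv64 -mattr=+experimental-v < mask-insert.ll
; This mirrors the insert_nxv32i1_nxv8i1_8 test above: both mask types take the
; bitcast fast path (nxv32i1 -> nxv4i8, nxv8i1 -> nxv1i8) and the insert becomes
; a vslideup whose offset is vlenb/8 at runtime.
define <vscale x 32 x i1> @example(<vscale x 32 x i1> %v, <vscale x 8 x i1> %sv) {
  %c = call <vscale x 32 x i1> @llvm.experimental.vector.insert.nxv8i1.nxv32i1(<vscale x 32 x i1> %v, <vscale x 8 x i1> %sv, i64 8)
  ret <vscale x 32 x i1> %c
}

declare <vscale x 32 x i1> @llvm.experimental.vector.insert.nxv8i1.nxv32i1(<vscale x 32 x i1>, <vscale x 8 x i1>, i64)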