diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -243,6 +243,9 @@ bool ForCodeSize) const override; bool hasBitPreservingFPLogic(EVT VT) const override; + bool + shouldExpandBuildVectorWithShuffles(EVT VT, + unsigned DefinedValues) const override; // Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -554,6 +554,7 @@ } setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::ADD, VT, Custom); @@ -610,6 +611,7 @@ setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::LOAD, VT, Custom); @@ -1021,6 +1023,19 @@ return getDefaultVLOps(VecVT, VecVT, DL, DAG, Subtarget); } +// The state of RVV BUILD_VECTOR and VECTOR_SHUFFLE lowering is that very few +// of either are (currently) supported. This can get us into an infinite loop +// where we try to lower a BUILD_VECTOR as a VECTOR_SHUFFLE as a BUILD_VECTOR +// as a ..., etc. +// Until either (or both) of these can reliably lower any node, reporting that +// we don't want to expand BUILD_VECTORs via VECTOR_SHUFFLEs at least breaks +// the infinite loop. Note that this lowers BUILD_VECTOR through the stack, +// which is not desirable. 
+bool RISCVTargetLowering::shouldExpandBuildVectorWithShuffles( + EVT VT, unsigned DefinedValues) const { + return false; +} + static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); @@ -2179,6 +2194,16 @@ SDValue Val = Op.getOperand(1); SDValue Idx = Op.getOperand(2); + MVT ContainerVT = VecVT; + // If the operand is a fixed-length vector, convert to a scalable one. + if (VecVT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(DAG, VecVT, Subtarget); + Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget); + } + + SDValue Mask, VL; + std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); + // Custom-legalize INSERT_VECTOR_ELT where XLEN>=SEW, so that the vector is // first slid down into position, the value is inserted into the first // position, and the vector is slid back up. We do this to simplify patterns. @@ -2186,21 +2211,17 @@ if (Subtarget.is64Bit() || Val.getValueType() != MVT::i64) { if (isNullConstant(Idx)) return Op; - SDValue Mask, VL; - std::tie(Mask, VL) = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget); - SDValue Slidedown = DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, VecVT, - DAG.getUNDEF(VecVT), Vec, Idx, Mask, VL); + SDValue Slidedown = + DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), Vec, Idx, Mask, VL); SDValue InsertElt0 = - DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, Slidedown, Val, + DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Slidedown, Val, DAG.getConstant(0, DL, Subtarget.getXLenVT())); - return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, VecVT, Vec, InsertElt0, Idx, - Mask, VL); + return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, ContainerVT, Vec, InsertElt0, + Idx, Mask, VL); } - if (!VecVT.isScalableVector()) - return SDValue(); - // Custom-legalize INSERT_VECTOR_ELT where XLEN* %x, i64 %y) { ; RV32-LABEL: insertelt_v4i64: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, 
sp, -128 -; RV32-NEXT: .cfi_def_cfa_offset 128 -; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: addi s0, sp, 128 -; RV32-NEXT: .cfi_def_cfa s0, 0 -; RV32-NEXT: andi sp, sp, -32 -; RV32-NEXT: sw a2, 32(sp) -; RV32-NEXT: sw a1, 64(sp) -; RV32-NEXT: vsetivli a1, 8, e32,m2,ta,mu -; RV32-NEXT: vle32.v v26, (a0) -; RV32-NEXT: vmv.x.s a1, v26 -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: addi a1, sp, 32 -; RV32-NEXT: vle32.v v28, (a1) -; RV32-NEXT: vmv.x.s a1, v28 -; RV32-NEXT: sw a1, 28(sp) -; RV32-NEXT: addi a1, sp, 64 -; RV32-NEXT: vle32.v v28, (a1) -; RV32-NEXT: vmv.x.s a1, v28 -; RV32-NEXT: sw a1, 24(sp) -; RV32-NEXT: vsetivli a1, 1, e32,m2,ta,mu -; RV32-NEXT: vslidedown.vi v28, v26, 5 -; RV32-NEXT: vmv.x.s a1, v28 -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: vslidedown.vi v28, v26, 4 -; RV32-NEXT: vmv.x.s a1, v28 -; RV32-NEXT: sw a1, 16(sp) -; RV32-NEXT: vslidedown.vi v28, v26, 3 -; RV32-NEXT: vmv.x.s a1, v28 -; RV32-NEXT: sw a1, 12(sp) -; RV32-NEXT: vslidedown.vi v28, v26, 2 -; RV32-NEXT: vmv.x.s a1, v28 -; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: vslidedown.vi v26, v26, 1 -; RV32-NEXT: vmv.x.s a1, v26 -; RV32-NEXT: sw a1, 4(sp) -; RV32-NEXT: vsetivli a1, 8, e32,m2,ta,mu -; RV32-NEXT: vle32.v v26, (sp) -; RV32-NEXT: vse32.v v26, (a0) -; RV32-NEXT: addi sp, s0, -128 -; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload -; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 128 +; RV32-NEXT: vsetivli a3, 4, e64,m2,ta,mu +; RV32-NEXT: vle64.v v26, (a0) +; RV32-NEXT: vsetvli a3, zero, e64,m2,ta,mu +; RV32-NEXT: vmv.v.x v28, a2 +; RV32-NEXT: addi a2, zero, 32 +; RV32-NEXT: vsll.vx v28, v28, a2 +; RV32-NEXT: vmv.v.x v30, a1 +; RV32-NEXT: vsll.vx v30, v30, a2 +; RV32-NEXT: vsrl.vx v30, v30, a2 +; RV32-NEXT: vor.vv v28, v30, v28 +; RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; RV32-NEXT: vid.v v30 +; RV32-NEXT: vmseq.vi v0, v30, 3 +; 
RV32-NEXT: vmerge.vvm v26, v26, v28, v0 +; RV32-NEXT: vse64.v v26, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: insertelt_v4i64: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -96 -; RV64-NEXT: .cfi_def_cfa_offset 96 -; RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: addi s0, sp, 96 -; RV64-NEXT: .cfi_def_cfa s0, 0 -; RV64-NEXT: andi sp, sp, -32 ; RV64-NEXT: vsetivli a2, 4, e64,m2,ta,mu ; RV64-NEXT: vle64.v v26, (a0) -; RV64-NEXT: sd a1, 32(sp) -; RV64-NEXT: vmv.x.s a1, v26 -; RV64-NEXT: sd a1, 0(sp) -; RV64-NEXT: addi a1, sp, 32 -; RV64-NEXT: vle64.v v28, (a1) -; RV64-NEXT: vmv.x.s a1, v28 -; RV64-NEXT: sd a1, 24(sp) -; RV64-NEXT: vsetivli a1, 1, e64,m2,ta,mu -; RV64-NEXT: vslidedown.vi v28, v26, 2 -; RV64-NEXT: vmv.x.s a1, v28 -; RV64-NEXT: sd a1, 16(sp) -; RV64-NEXT: vslidedown.vi v26, v26, 1 -; RV64-NEXT: vmv.x.s a1, v26 -; RV64-NEXT: sd a1, 8(sp) +; RV64-NEXT: vslidedown.vi v28, v26, 3 +; RV64-NEXT: vsetvli a2, zero, e64,m2,ta,mu +; RV64-NEXT: vmv.s.x v28, a1 +; RV64-NEXT: vsetivli a1, 4, e64,m2,tu,mu +; RV64-NEXT: vslideup.vi v26, v28, 3 ; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu -; RV64-NEXT: vle64.v v26, (sp) ; RV64-NEXT: vse64.v v26, (a0) -; RV64-NEXT: addi sp, s0, -96 -; RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload -; RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 96 ; RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = insertelement <4 x i64> %a, i64 %y, i32 3 @@ -104,8 +50,39 @@ define void @insertelt_v3i64(<3 x i64>* %x, i64 %y) { ; RV32-LABEL: insertelt_v3i64: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: addi a3, a0, 16 +; RV32-NEXT: vsetivli a4, 2, e32,m1,ta,mu +; RV32-NEXT: vle32.v v25, (a3) +; RV32-NEXT: vse32.v v25, (sp) +; RV32-NEXT: vsetivli a3, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v26, (a0) +; RV32-NEXT: vsetivli a3, 8, e32,m2,ta,mu +; 
RV32-NEXT: vmv.v.i v28, 0 +; RV32-NEXT: vsetivli a3, 2, e64,m2,tu,mu +; RV32-NEXT: vslideup.vi v28, v26, 0 +; RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (sp) +; RV32-NEXT: vsetivli a3, 4, e64,m2,tu,mu +; RV32-NEXT: vslideup.vi v28, v26, 2 +; RV32-NEXT: vsetvli a3, zero, e64,m2,ta,mu +; RV32-NEXT: vmv.v.x v26, a2 +; RV32-NEXT: addi a3, zero, 32 +; RV32-NEXT: vsll.vx v26, v26, a3 +; RV32-NEXT: vmv.v.x v30, a1 +; RV32-NEXT: vsll.vx v30, v30, a3 +; RV32-NEXT: vsrl.vx v30, v30, a3 +; RV32-NEXT: vor.vv v26, v30, v26 +; RV32-NEXT: vsetivli a3, 4, e64,m2,ta,mu +; RV32-NEXT: vid.v v30 +; RV32-NEXT: vmseq.vi v0, v30, 2 +; RV32-NEXT: vmerge.vvm v26, v28, v26, v0 +; RV32-NEXT: vsetivli a3, 2, e64,m1,ta,mu +; RV32-NEXT: vse64.v v26, (a0) ; RV32-NEXT: sw a1, 16(a0) ; RV32-NEXT: sw a2, 20(a0) +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: insertelt_v3i64: @@ -117,3 +94,103 @@ store <3 x i64> %b, <3 x i64>* %x ret void } + +define void @insertelt_v16i8(<16 x i8>* %x, i8 %y) { +; RV32-LABEL: insertelt_v16i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a2, 16, e8,m1,ta,mu +; RV32-NEXT: vle8.v v25, (a0) +; RV32-NEXT: vslidedown.vi v26, v25, 14 +; RV32-NEXT: vsetvli a2, zero, e8,m1,ta,mu +; RV32-NEXT: vmv.s.x v26, a1 +; RV32-NEXT: vsetivli a1, 16, e8,m1,tu,mu +; RV32-NEXT: vslideup.vi v25, v26, 14 +; RV32-NEXT: vsetivli a1, 16, e8,m1,ta,mu +; RV32-NEXT: vse8.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: insertelt_v16i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a2, 16, e8,m1,ta,mu +; RV64-NEXT: vle8.v v25, (a0) +; RV64-NEXT: vslidedown.vi v26, v25, 14 +; RV64-NEXT: vsetvli a2, zero, e8,m1,ta,mu +; RV64-NEXT: vmv.s.x v26, a1 +; RV64-NEXT: vsetivli a1, 16, e8,m1,tu,mu +; RV64-NEXT: vslideup.vi v25, v26, 14 +; RV64-NEXT: vsetivli a1, 16, e8,m1,ta,mu +; RV64-NEXT: vse8.v v25, (a0) +; RV64-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = insertelement <16 x i8> %a, i8 %y, i32 14 + store <16 x i8> %b, <16 x i8>* %x + ret void +} + +define void 
@insertelt_v32i16(<32 x i16>* %x, i16 %y, i32 %idx) { +; RV32-LABEL: insertelt_v32i16: +; RV32: # %bb.0: +; RV32-NEXT: addi a3, zero, 32 +; RV32-NEXT: vsetvli a4, a3, e16,m4,ta,mu +; RV32-NEXT: vle16.v v28, (a0) +; RV32-NEXT: vslidedown.vx v8, v28, a2 +; RV32-NEXT: vsetvli a4, zero, e16,m4,ta,mu +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetvli a1, a3, e16,m4,tu,mu +; RV32-NEXT: vslideup.vx v28, v8, a2 +; RV32-NEXT: vsetvli a1, a3, e16,m4,ta,mu +; RV32-NEXT: vse16.v v28, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: insertelt_v32i16: +; RV64: # %bb.0: +; RV64-NEXT: addi a3, zero, 32 +; RV64-NEXT: vsetvli a4, a3, e16,m4,ta,mu +; RV64-NEXT: vle16.v v28, (a0) +; RV64-NEXT: sext.w a2, a2 +; RV64-NEXT: vslidedown.vx v8, v28, a2 +; RV64-NEXT: vsetvli a4, zero, e16,m4,ta,mu +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetvli a1, a3, e16,m4,tu,mu +; RV64-NEXT: vslideup.vx v28, v8, a2 +; RV64-NEXT: vsetvli a1, a3, e16,m4,ta,mu +; RV64-NEXT: vse16.v v28, (a0) +; RV64-NEXT: ret + %a = load <32 x i16>, <32 x i16>* %x + %b = insertelement <32 x i16> %a, i16 %y, i32 %idx + store <32 x i16> %b, <32 x i16>* %x + ret void +} + +define void @insertelt_v8f32(<8 x float>* %x, float %y, i32 %idx) { +; RV32-LABEL: insertelt_v8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; RV32-NEXT: vle32.v v26, (a0) +; RV32-NEXT: vslidedown.vx v28, v26, a1 +; RV32-NEXT: vsetvli a2, zero, e32,m2,ta,mu +; RV32-NEXT: vfmv.s.f v28, fa0 +; RV32-NEXT: vsetivli a2, 8, e32,m2,tu,mu +; RV32-NEXT: vslideup.vx v26, v28, a1 +; RV32-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; RV32-NEXT: vse32.v v26, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: insertelt_v8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; RV64-NEXT: vle32.v v26, (a0) +; RV64-NEXT: sext.w a1, a1 +; RV64-NEXT: vslidedown.vx v28, v26, a1 +; RV64-NEXT: vsetvli a2, zero, e32,m2,ta,mu +; RV64-NEXT: vfmv.s.f v28, fa0 +; RV64-NEXT: vsetivli a2, 8, e32,m2,tu,mu +; RV64-NEXT: vslideup.vx v26, v28, a1 +; RV64-NEXT: vsetivli a1, 8, 
e32,m2,ta,mu +; RV64-NEXT: vse32.v v26, (a0) +; RV64-NEXT: ret + %a = load <8 x float>, <8 x float>* %x + %b = insertelement <8 x float> %a, float %y, i32 %idx + store <8 x float> %b, <8 x float>* %x + ret void +}