diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -127,6 +127,8 @@
   VECREDUCE_AND,
   VECREDUCE_OR,
   VECREDUCE_XOR,
+  VECREDUCE_FADD,
+  VECREDUCE_SEQ_FADD,
 };
 } // namespace RISCVISD
 
@@ -329,6 +331,7 @@
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
 
   bool isEligibleForTailCallOptimization(
       CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -472,6 +472,9 @@
       // Expand various condition codes (explained above).
       for (auto CC : VFPCCToExpand)
         setCondCodeAction(CC, VT, Expand);
+
+      setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
     };
 
     if (Subtarget.hasStdExtZfh())
@@ -913,6 +916,9 @@
   case ISD::VECREDUCE_OR:
   case ISD::VECREDUCE_XOR:
     return lowerVECREDUCE(Op, DAG);
+  case ISD::VECREDUCE_FADD:
+  case ISD::VECREDUCE_SEQ_FADD:
+    return lowerFPVECREDUCE(Op, DAG);
   }
 }
 
@@ -1689,6 +1695,44 @@
   return DAG.getSExtOrTrunc(Elt0, DL, Op.getValueType());
 }
 
+// Given a reduction op, this function returns the matching reduction opcode,
+// the vector SDValue and the scalar SDValue required to lower this to a
+// RISCVISD node.
+static std::tuple<unsigned, SDValue, SDValue>
+getRVVFPReductionOpAndOperands(SDValue Op, SelectionDAG &DAG, EVT EltVT) {
+  SDLoc DL(Op);
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("Unhandled reduction");
+  case ISD::VECREDUCE_FADD:
+    return {RISCVISD::VECREDUCE_FADD, Op.getOperand(0),
+            DAG.getConstantFP(0.0, DL, EltVT)};
+  case ISD::VECREDUCE_SEQ_FADD:
+    return {RISCVISD::VECREDUCE_SEQ_FADD, Op.getOperand(1), Op.getOperand(0)};
+  }
+}
+
+SDValue RISCVTargetLowering::lowerFPVECREDUCE(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VecEltVT = Op.getSimpleValueType();
+  // We have to perform a bit of a dance to get from our vector type to the
+  // correct LMUL=1 vector type. See above for an explanation.
+  unsigned NumElts = 64 / VecEltVT.getSizeInBits();
+  MVT M1VT = MVT::getScalableVectorVT(VecEltVT, NumElts);
+
+  unsigned RVVOpcode;
+  SDValue VectorVal, ScalarVal;
+  std::tie(RVVOpcode, VectorVal, ScalarVal) =
+      getRVVFPReductionOpAndOperands(Op, DAG, VecEltVT);
+
+  SDValue ZeroIdx = DAG.getConstant(0, DL, Subtarget.getXLenVT());
+  SDValue ScalarInVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, M1VT,
+                                    DAG.getUNDEF(M1VT), ScalarVal, ZeroIdx);
+  SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, VectorVal, ScalarInVec);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction, ZeroIdx);
+}
+
 // Returns the opcode of the target-specific SDNode that implements the 32-bit
 // form of the given Opcode.
 static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
@@ -4248,6 +4292,8 @@
   NODE_NAME_CASE(VECREDUCE_AND)
   NODE_NAME_CASE(VECREDUCE_OR)
   NODE_NAME_CASE(VECREDUCE_XOR)
+  NODE_NAME_CASE(VECREDUCE_FADD)
+  NODE_NAME_CASE(VECREDUCE_SEQ_FADD)
   }
   // clang-format on
   return nullptr;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -47,7 +47,8 @@
   SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>
 ]>;
 
-foreach kind = ["ADD", "UMAX", "SMAX", "UMIN", "SMIN", "AND", "OR", "XOR"] in
+foreach kind = ["ADD", "UMAX", "SMAX", "UMIN", "SMIN", "AND", "OR", "XOR",
+                "FADD", "SEQ_FADD"] in
 def rvv_vecreduce_#kind : SDNode<"RISCVISD::VECREDUCE_"#kind, SDTRVVVecReduce>;
 
 multiclass VPatUSLoadStoreSDNode<
-multiclass VPatReductionSDNode<SDNode vop, string instruction_name> {
-  foreach vti = AllIntegerVectors in {
-    defvar vti_m1 = !cast<VTypeInfo>("VI" # vti.SEW # "M1");
+multiclass VPatReductionSDNode<SDNode vop, string instruction_name, bit is_float> {
+  foreach vti = !if(is_float, AllFloatVectors, AllIntegerVectors) in {
+    defvar vti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # vti.SEW # "M1");
     def: Pat<(vti_m1.Vector (vop (vti.Vector vti.RegClass:$rs1), VR:$rs2)),
              (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX)
                  (vti_m1.Vector (IMPLICIT_DEF)),
@@ -486,14 +487,18 @@
 }
 
 // 15.1. Vector Single-Width Integer Reduction Instructions
-defm "" : VPatReductionSDNode<rvv_vecreduce_ADD, "PseudoVREDSUM">;
-defm "" : VPatReductionSDNode<rvv_vecreduce_UMAX, "PseudoVREDMAXU">;
-defm "" : VPatReductionSDNode<rvv_vecreduce_SMAX, "PseudoVREDMAX">;
-defm "" : VPatReductionSDNode<rvv_vecreduce_UMIN, "PseudoVREDMINU">;
-defm "" : VPatReductionSDNode<rvv_vecreduce_SMIN, "PseudoVREDMIN">;
-defm "" : VPatReductionSDNode<rvv_vecreduce_AND, "PseudoVREDAND">;
-defm "" : VPatReductionSDNode<rvv_vecreduce_OR, "PseudoVREDOR">;
-defm "" : VPatReductionSDNode<rvv_vecreduce_XOR, "PseudoVREDXOR">;
+defm "" : VPatReductionSDNode<rvv_vecreduce_ADD, "PseudoVREDSUM", /*is_float*/0>;
+defm "" : VPatReductionSDNode<rvv_vecreduce_UMAX, "PseudoVREDMAXU", /*is_float*/0>;
+defm "" : VPatReductionSDNode<rvv_vecreduce_SMAX, "PseudoVREDMAX", /*is_float*/0>;
+defm "" : VPatReductionSDNode<rvv_vecreduce_UMIN, "PseudoVREDMINU", /*is_float*/0>;
+defm "" : VPatReductionSDNode<rvv_vecreduce_SMIN, "PseudoVREDMIN", /*is_float*/0>;
+defm "" : VPatReductionSDNode<rvv_vecreduce_AND, "PseudoVREDAND", /*is_float*/0>;
+defm "" : VPatReductionSDNode<rvv_vecreduce_OR, "PseudoVREDOR", /*is_float*/0>;
+defm "" : VPatReductionSDNode<rvv_vecreduce_XOR, "PseudoVREDXOR", /*is_float*/0>;
+
+// 15.3. Vector Single-Width Floating-Point Reduction Instructions
+defm "" : VPatReductionSDNode<rvv_vecreduce_FADD, "PseudoVFREDSUM", /*is_float*/1>;
+defm "" : VPatReductionSDNode<rvv_vecreduce_SEQ_FADD, "PseudoVFREDOSUM", /*is_float*/1>;
 
 // 16.1. Vector Mask-Register Logical Instructions
 foreach mti = AllMasks in {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -102,9 +102,19 @@
   switch (II->getIntrinsicID()) {
   default:
     return false;
+  // These reductions have no equivalent in RVV.
   case Intrinsic::vector_reduce_mul:
-  case Intrinsic::vector_reduce_fadd:
   case Intrinsic::vector_reduce_fmul:
+  // The fmin and fmax intrinsics are not currently supported due to a
+  // discrepancy between the LLVM semantics and the RVV 0.10 ISA behaviour
+  // with regard to signaling NaNs: the vector fmin/fmax reduction intrinsics
+  // match the behaviour of the minnum/maxnum intrinsics, whereas the
+  // vfredmin/vfredmax instructions match the vfmin/vfmax instructions, which
+  // in turn match the equivalent scalar fmin/fmax instructions as defined in
+  // version 2.2 of the F/D/Q extension (see
+  // https://bugs.llvm.org/show_bug.cgi?id=27363). This behaviour is likely
+  // fixed in version 2.3 of the RISC-V F/D/Q extension, where fmin/fmax
+  // behave like minnum/maxnum, but until then the intrinsics are left unsupported.
case Intrinsic::vector_reduce_fmax: case Intrinsic::vector_reduce_fmin: return true; diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll @@ -0,0 +1,467 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32 +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64 + +declare half @llvm.vector.reduce.fadd.nxv1f16(half, ) + +define half @vreduce_fadd_nxv1f16( %v, half %s) { +; RV32-LABEL: vreduce_fadd_nxv1f16: +; RV32: # %bb.0: +; RV32-NEXT: fmv.h.x ft0, zero +; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, ft0 +; RV32-NEXT: vsetvli a0, zero, e16,mf4,ta,mu +; RV32-NEXT: vfredsum.vs v25, v8, v25 +; RV32-NEXT: vsetvli zero, zero, e16,m1,ta,mu +; RV32-NEXT: vfmv.f.s ft0, v25 +; RV32-NEXT: fadd.h fa0, fa0, ft0 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fadd_nxv1f16: +; RV64: # %bb.0: +; RV64-NEXT: fmv.h.x ft0, zero +; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, ft0 +; RV64-NEXT: vsetvli a0, zero, e16,mf4,ta,mu +; RV64-NEXT: vfredsum.vs v25, v8, v25 +; RV64-NEXT: vsetvli zero, zero, e16,m1,ta,mu +; RV64-NEXT: vfmv.f.s ft0, v25 +; RV64-NEXT: fadd.h fa0, fa0, ft0 +; RV64-NEXT: ret + %red = call reassoc half @llvm.vector.reduce.fadd.nxv1f16(half %s, %v) + ret half %red +} + +define half @vreduce_ord_fadd_nxv1f16( %v, half %s) { +; RV32-LABEL: vreduce_ord_fadd_nxv1f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, fa0 +; RV32-NEXT: vsetvli a0, zero, e16,mf4,ta,mu +; RV32-NEXT: vfredosum.vs v25, v8, v25 +; RV32-NEXT: vsetvli zero, zero, e16,m1,ta,mu +; RV32-NEXT: vfmv.f.s fa0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_ord_fadd_nxv1f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, fa0 +; RV64-NEXT: vsetvli a0, zero, e16,mf4,ta,mu +; RV64-NEXT: vfredosum.vs v25, v8, v25 +; RV64-NEXT: vsetvli zero, zero, e16,m1,ta,mu +; RV64-NEXT: vfmv.f.s fa0, v25 +; RV64-NEXT: ret + %red = call half @llvm.vector.reduce.fadd.nxv1f16(half %s, %v) + ret half %red +} + +declare half @llvm.vector.reduce.fadd.nxv2f16(half, ) + +define half @vreduce_fadd_nxv2f16( %v, half %s) { +; RV32-LABEL: vreduce_fadd_nxv2f16: +; RV32: # %bb.0: +; RV32-NEXT: fmv.h.x ft0, zero +; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, ft0 +; RV32-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV32-NEXT: vfredsum.vs v25, v8, v25 +; RV32-NEXT: vsetvli zero, zero, e16,m1,ta,mu +; RV32-NEXT: vfmv.f.s ft0, v25 +; RV32-NEXT: fadd.h fa0, fa0, ft0 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fadd_nxv2f16: +; RV64: # %bb.0: +; RV64-NEXT: fmv.h.x ft0, zero +; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, ft0 +; RV64-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV64-NEXT: vfredsum.vs v25, v8, v25 +; RV64-NEXT: vsetvli zero, zero, e16,m1,ta,mu +; RV64-NEXT: vfmv.f.s ft0, v25 +; RV64-NEXT: fadd.h fa0, fa0, ft0 +; RV64-NEXT: ret + %red = call reassoc half @llvm.vector.reduce.fadd.nxv2f16(half %s, %v) + ret half %red +} + +define half @vreduce_ord_fadd_nxv2f16( %v, half %s) { +; RV32-LABEL: vreduce_ord_fadd_nxv2f16: +; RV32: # 
%bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, fa0 +; RV32-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV32-NEXT: vfredosum.vs v25, v8, v25 +; RV32-NEXT: vsetvli zero, zero, e16,m1,ta,mu +; RV32-NEXT: vfmv.f.s fa0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_ord_fadd_nxv2f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, fa0 +; RV64-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV64-NEXT: vfredosum.vs v25, v8, v25 +; RV64-NEXT: vsetvli zero, zero, e16,m1,ta,mu +; RV64-NEXT: vfmv.f.s fa0, v25 +; RV64-NEXT: ret + %red = call half @llvm.vector.reduce.fadd.nxv2f16(half %s, %v) + ret half %red +} + +declare half @llvm.vector.reduce.fadd.nxv4f16(half, ) + +define half @vreduce_fadd_nxv4f16( %v, half %s) { +; RV32-LABEL: vreduce_fadd_nxv4f16: +; RV32: # %bb.0: +; RV32-NEXT: fmv.h.x ft0, zero +; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, ft0 +; RV32-NEXT: vfredsum.vs v25, v8, v25 +; RV32-NEXT: vfmv.f.s ft0, v25 +; RV32-NEXT: fadd.h fa0, fa0, ft0 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fadd_nxv4f16: +; RV64: # %bb.0: +; RV64-NEXT: fmv.h.x ft0, zero +; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, ft0 +; RV64-NEXT: vfredsum.vs v25, v8, v25 +; RV64-NEXT: vfmv.f.s ft0, v25 +; RV64-NEXT: fadd.h fa0, fa0, ft0 +; RV64-NEXT: ret + %red = call reassoc half @llvm.vector.reduce.fadd.nxv4f16(half %s, %v) + ret half %red +} + +define half @vreduce_ord_fadd_nxv4f16( %v, half %s) { +; RV32-LABEL: vreduce_ord_fadd_nxv4f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, fa0 +; RV32-NEXT: vfredosum.vs v25, v8, v25 +; RV32-NEXT: vfmv.f.s fa0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_ord_fadd_nxv4f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, fa0 +; RV64-NEXT: vfredosum.vs v25, v8, v25 +; RV64-NEXT: vfmv.f.s fa0, v25 +; RV64-NEXT: ret + %red = call half @llvm.vector.reduce.fadd.nxv4f16(half %s, %v) + ret half %red +} + +declare float @llvm.vector.reduce.fadd.nxv1f32(float, ) + +define float @vreduce_fadd_nxv1f32( %v, float %s) { +; RV32-LABEL: vreduce_fadd_nxv1f32: +; RV32: # %bb.0: +; RV32-NEXT: fmv.w.x ft0, zero +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, ft0 +; RV32-NEXT: vsetvli a0, zero, e32,mf2,ta,mu +; RV32-NEXT: vfredsum.vs v25, v8, v25 +; RV32-NEXT: vsetvli zero, zero, e32,m1,ta,mu +; RV32-NEXT: vfmv.f.s ft0, v25 +; RV32-NEXT: fadd.s fa0, fa0, ft0 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fadd_nxv1f32: +; RV64: # %bb.0: +; RV64-NEXT: fmv.w.x ft0, zero +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, ft0 +; RV64-NEXT: vsetvli a0, zero, e32,mf2,ta,mu +; RV64-NEXT: vfredsum.vs v25, v8, v25 +; RV64-NEXT: vsetvli zero, zero, e32,m1,ta,mu +; RV64-NEXT: vfmv.f.s ft0, v25 +; RV64-NEXT: fadd.s fa0, fa0, ft0 +; RV64-NEXT: ret + %red = call reassoc float @llvm.vector.reduce.fadd.nxv1f32(float %s, %v) + ret float %red +} + +define float @vreduce_ord_fadd_nxv1f32( %v, float %s) { +; RV32-LABEL: vreduce_ord_fadd_nxv1f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, fa0 +; RV32-NEXT: vsetvli a0, zero, e32,mf2,ta,mu +; RV32-NEXT: vfredosum.vs v25, v8, v25 +; RV32-NEXT: vsetvli zero, zero, e32,m1,ta,mu +; RV32-NEXT: vfmv.f.s fa0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_ord_fadd_nxv1f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, fa0 +; RV64-NEXT: 
vsetvli a0, zero, e32,mf2,ta,mu +; RV64-NEXT: vfredosum.vs v25, v8, v25 +; RV64-NEXT: vsetvli zero, zero, e32,m1,ta,mu +; RV64-NEXT: vfmv.f.s fa0, v25 +; RV64-NEXT: ret + %red = call float @llvm.vector.reduce.fadd.nxv1f32(float %s, %v) + ret float %red +} + +declare float @llvm.vector.reduce.fadd.nxv2f32(float, ) + +define float @vreduce_fadd_nxv2f32( %v, float %s) { +; RV32-LABEL: vreduce_fadd_nxv2f32: +; RV32: # %bb.0: +; RV32-NEXT: fmv.w.x ft0, zero +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, ft0 +; RV32-NEXT: vfredsum.vs v25, v8, v25 +; RV32-NEXT: vfmv.f.s ft0, v25 +; RV32-NEXT: fadd.s fa0, fa0, ft0 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fadd_nxv2f32: +; RV64: # %bb.0: +; RV64-NEXT: fmv.w.x ft0, zero +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, ft0 +; RV64-NEXT: vfredsum.vs v25, v8, v25 +; RV64-NEXT: vfmv.f.s ft0, v25 +; RV64-NEXT: fadd.s fa0, fa0, ft0 +; RV64-NEXT: ret + %red = call reassoc float @llvm.vector.reduce.fadd.nxv2f32(float %s, %v) + ret float %red +} + +define float @vreduce_ord_fadd_nxv2f32( %v, float %s) { +; RV32-LABEL: vreduce_ord_fadd_nxv2f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, fa0 +; RV32-NEXT: vfredosum.vs v25, v8, v25 +; RV32-NEXT: vfmv.f.s fa0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_ord_fadd_nxv2f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, fa0 +; RV64-NEXT: vfredosum.vs v25, v8, v25 +; RV64-NEXT: vfmv.f.s fa0, v25 +; RV64-NEXT: ret + %red = call float @llvm.vector.reduce.fadd.nxv2f32(float %s, %v) + ret float %red +} + +declare float @llvm.vector.reduce.fadd.nxv4f32(float, ) + +define float @vreduce_fadd_nxv4f32( %v, float %s) { +; RV32-LABEL: vreduce_fadd_nxv4f32: +; RV32: # %bb.0: +; RV32-NEXT: fmv.w.x ft0, zero +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, ft0 +; RV32-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV32-NEXT: vfredsum.vs v25, v8, v25 +; RV32-NEXT: vsetvli zero, zero, e32,m1,ta,mu +; RV32-NEXT: vfmv.f.s ft0, v25 +; RV32-NEXT: fadd.s fa0, fa0, ft0 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fadd_nxv4f32: +; RV64: # %bb.0: +; RV64-NEXT: fmv.w.x ft0, zero +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, ft0 +; RV64-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV64-NEXT: vfredsum.vs v25, v8, v25 +; RV64-NEXT: vsetvli zero, zero, e32,m1,ta,mu +; RV64-NEXT: vfmv.f.s ft0, v25 +; RV64-NEXT: fadd.s fa0, fa0, ft0 +; RV64-NEXT: ret + %red = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float %s, %v) + ret float %red +} + +define float @vreduce_ord_fadd_nxv4f32( %v, float %s) { +; RV32-LABEL: vreduce_ord_fadd_nxv4f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, fa0 +; RV32-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV32-NEXT: vfredosum.vs v25, v8, v25 +; RV32-NEXT: vsetvli zero, zero, e32,m1,ta,mu +; RV32-NEXT: vfmv.f.s fa0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_ord_fadd_nxv4f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, fa0 +; RV64-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV64-NEXT: vfredosum.vs v25, v8, v25 +; RV64-NEXT: vsetvli zero, zero, e32,m1,ta,mu +; RV64-NEXT: vfmv.f.s fa0, v25 +; RV64-NEXT: ret + %red = call float @llvm.vector.reduce.fadd.nxv4f32(float %s, %v) + ret float %red +} + +declare double @llvm.vector.reduce.fadd.nxv1f64(double, ) + +define double @vreduce_fadd_nxv1f64( %v, double %s) { +; RV32-LABEL: 
vreduce_fadd_nxv1f64: +; RV32: # %bb.0: +; RV32-NEXT: fcvt.d.w ft0, zero +; RV32-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, ft0 +; RV32-NEXT: vfredsum.vs v25, v8, v25 +; RV32-NEXT: vfmv.f.s ft0, v25 +; RV32-NEXT: fadd.d fa0, fa0, ft0 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fadd_nxv1f64: +; RV64: # %bb.0: +; RV64-NEXT: fmv.d.x ft0, zero +; RV64-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, ft0 +; RV64-NEXT: vfredsum.vs v25, v8, v25 +; RV64-NEXT: vfmv.f.s ft0, v25 +; RV64-NEXT: fadd.d fa0, fa0, ft0 +; RV64-NEXT: ret + %red = call reassoc double @llvm.vector.reduce.fadd.nxv1f64(double %s, %v) + ret double %red +} + +define double @vreduce_ord_fadd_nxv1f64( %v, double %s) { +; RV32-LABEL: vreduce_ord_fadd_nxv1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, fa0 +; RV32-NEXT: vfredosum.vs v25, v8, v25 +; RV32-NEXT: vfmv.f.s fa0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_ord_fadd_nxv1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, fa0 +; RV64-NEXT: vfredosum.vs v25, v8, v25 +; RV64-NEXT: vfmv.f.s fa0, v25 +; RV64-NEXT: ret + %red = call double @llvm.vector.reduce.fadd.nxv1f64(double %s, %v) + ret double %red +} + +declare double @llvm.vector.reduce.fadd.nxv2f64(double, ) + +define double @vreduce_fadd_nxv2f64( %v, double %s) { +; RV32-LABEL: vreduce_fadd_nxv2f64: +; RV32: # %bb.0: +; RV32-NEXT: fcvt.d.w ft0, zero +; RV32-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, ft0 +; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV32-NEXT: vfredsum.vs v25, v8, v25 +; RV32-NEXT: vsetvli zero, zero, e64,m1,ta,mu +; RV32-NEXT: vfmv.f.s ft0, v25 +; RV32-NEXT: fadd.d fa0, fa0, ft0 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fadd_nxv2f64: +; RV64: # %bb.0: +; RV64-NEXT: fmv.d.x ft0, zero +; RV64-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, ft0 +; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV64-NEXT: vfredsum.vs v25, v8, v25 +; RV64-NEXT: vsetvli zero, zero, e64,m1,ta,mu +; RV64-NEXT: vfmv.f.s ft0, v25 +; RV64-NEXT: fadd.d fa0, fa0, ft0 +; RV64-NEXT: ret + %red = call reassoc double @llvm.vector.reduce.fadd.nxv2f64(double %s, %v) + ret double %red +} + +define double @vreduce_ord_fadd_nxv2f64( %v, double %s) { +; RV32-LABEL: vreduce_ord_fadd_nxv2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, fa0 +; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV32-NEXT: vfredosum.vs v25, v8, v25 +; RV32-NEXT: vsetvli zero, zero, e64,m1,ta,mu +; RV32-NEXT: vfmv.f.s fa0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_ord_fadd_nxv2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, fa0 +; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV64-NEXT: vfredosum.vs v25, v8, v25 +; RV64-NEXT: vsetvli zero, zero, e64,m1,ta,mu +; RV64-NEXT: vfmv.f.s fa0, v25 +; RV64-NEXT: ret + %red = call double @llvm.vector.reduce.fadd.nxv2f64(double %s, %v) + ret double %red +} + +declare double @llvm.vector.reduce.fadd.nxv4f64(double, ) + +define double @vreduce_fadd_nxv4f64( %v, double %s) { +; RV32-LABEL: vreduce_fadd_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: fcvt.d.w ft0, zero +; RV32-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, ft0 +; RV32-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV32-NEXT: vfredsum.vs v25, v8, v25 +; RV32-NEXT: vsetvli zero, zero, e64,m1,ta,mu +; RV32-NEXT: vfmv.f.s ft0, v25 +; RV32-NEXT: fadd.d fa0, fa0, ft0 +; RV32-NEXT: ret +; +; 
RV64-LABEL: vreduce_fadd_nxv4f64: +; RV64: # %bb.0: +; RV64-NEXT: fmv.d.x ft0, zero +; RV64-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, ft0 +; RV64-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV64-NEXT: vfredsum.vs v25, v8, v25 +; RV64-NEXT: vsetvli zero, zero, e64,m1,ta,mu +; RV64-NEXT: vfmv.f.s ft0, v25 +; RV64-NEXT: fadd.d fa0, fa0, ft0 +; RV64-NEXT: ret + %red = call reassoc double @llvm.vector.reduce.fadd.nxv4f64(double %s, %v) + ret double %red +} + +define double @vreduce_ord_fadd_nxv4f64( %v, double %s) { +; RV32-LABEL: vreduce_ord_fadd_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV32-NEXT: vfmv.s.f v25, fa0 +; RV32-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV32-NEXT: vfredosum.vs v25, v8, v25 +; RV32-NEXT: vsetvli zero, zero, e64,m1,ta,mu +; RV32-NEXT: vfmv.f.s fa0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_ord_fadd_nxv4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV64-NEXT: vfmv.s.f v25, fa0 +; RV64-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV64-NEXT: vfredosum.vs v25, v8, v25 +; RV64-NEXT: vsetvli zero, zero, e64,m1,ta,mu +; RV64-NEXT: vfmv.f.s fa0, v25 +; RV64-NEXT: ret + %red = call double @llvm.vector.reduce.fadd.nxv4f64(double %s, %v) + ret double %red +}
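
For reference, a minimal standalone sketch, not part of the patch, of the container-type arithmetic that lowerFPVECREDUCE uses: assuming the ELEN=64 baseline implied by the 64 / VecEltVT.getSizeInBits() computation above, the element width alone determines the LMUL=1 scalable type in which the reduction is performed. The helper name below is illustrative only.

#include <cstdio>

// Mirrors the arithmetic in lowerFPVECREDUCE: with ELEN=64, an LMUL=1
// register group holds 64 / EltBits elements, so an f16 reduction is carried
// out in nxv4f16, an f32 reduction in nxv2f32 and an f64 reduction in
// nxv1f64. The scalar start value is inserted into element 0 of that
// container and the result is extracted from element 0 afterwards.
static unsigned lmul1ElementCount(unsigned EltBits) { return 64 / EltBits; }

int main() {
  const unsigned Widths[] = {16, 32, 64};
  for (unsigned EltBits : Widths)
    std::printf("f%u reduction -> LMUL=1 container nxv%uf%u\n", EltBits,
                lmul1ElementCount(EltBits), EltBits);
  return 0;
}

This matches the vsetvli sequences in the CHECK lines above: the scalar insert (vfmv.s.f) and extract (vfmv.f.s) always run at e16/e32/e64 with m1, while the vfredsum.vs/vfredosum.vs reduction itself runs at the source vector's own SEW/LMUL.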