diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1036,6 +1036,15 @@
   setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::OR);
   setTargetDAGCombine(ISD::XOR);
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::FMAXNUM);
+  setTargetDAGCombine(ISD::FMINNUM);
+  if (Subtarget.hasStdExtZbb()) {
+    setTargetDAGCombine(ISD::UMAX);
+    setTargetDAGCombine(ISD::UMIN);
+    setTargetDAGCombine(ISD::SMAX);
+    setTargetDAGCombine(ISD::SMIN);
+  }
   if (Subtarget.hasStdExtZbp()) {
     setTargetDAGCombine(ISD::ROTL);
     setTargetDAGCombine(ISD::ROTR);
@@ -7195,6 +7204,108 @@
   return matchRISCVBitmanipPat(Op, BitmanipMasks);
 }
 
+// Try to fold (<bop> x, (reduction.<bop> vec, start))
+static SDValue combineBinOpToReduce(SDNode *N, SelectionDAG &DAG) {
+  auto BinOpToRVVReduce = [](unsigned Opc) {
+    switch (Opc) {
+    default:
+      llvm_unreachable("Unhandled binary to transform reduction");
+    case ISD::ADD:
+      return RISCVISD::VECREDUCE_ADD_VL;
+    case ISD::UMAX:
+      return RISCVISD::VECREDUCE_UMAX_VL;
+    case ISD::SMAX:
+      return RISCVISD::VECREDUCE_SMAX_VL;
+    case ISD::UMIN:
+      return RISCVISD::VECREDUCE_UMIN_VL;
+    case ISD::SMIN:
+      return RISCVISD::VECREDUCE_SMIN_VL;
+    case ISD::AND:
+      return RISCVISD::VECREDUCE_AND_VL;
+    case ISD::OR:
+      return RISCVISD::VECREDUCE_OR_VL;
+    case ISD::XOR:
+      return RISCVISD::VECREDUCE_XOR_VL;
+    case ISD::FADD:
+      return RISCVISD::VECREDUCE_FADD_VL;
+    case ISD::FMAXNUM:
+      return RISCVISD::VECREDUCE_FMAX_VL;
+    case ISD::FMINNUM:
+      return RISCVISD::VECREDUCE_FMIN_VL;
+    }
+  };
+
+  auto IsReduction = [&BinOpToRVVReduce](SDValue V, unsigned Opc) {
+    return V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+           V.getOperand(0).getOpcode() == BinOpToRVVReduce(Opc);
+  };
+
+  unsigned Opc = N->getOpcode();
+  unsigned ReduceIdx;
+  if (IsReduction(N->getOperand(0), Opc))
+    ReduceIdx = 0;
+  else if (IsReduction(N->getOperand(1), Opc))
+    ReduceIdx = 1;
+  else
+    return SDValue();
+
+  // Skip if FADD disallows reassociation but the combiner needs it.
+  if (Opc == ISD::FADD && !N->getFlags().hasAllowReassociation() &&
+      ReduceIdx == 0)
+    return SDValue();
+
+  SDValue Extract = N->getOperand(ReduceIdx);
+  SDValue Reduce = Extract->getOperand(0);
+  if (!Reduce->hasOneUse())
+    return SDValue();
+
+  SDValue ScalarV = Reduce->getOperand(2);
+
+  // Make sure the scalar vector of Reduce is a splat of the neutral element.
+  if (ScalarV.getOpcode() != RISCVISD::VFMV_S_F_VL &&
+      ScalarV.getOpcode() != RISCVISD::VMV_S_X_VL &&
+      ScalarV.getOpcode() != RISCVISD::VMV_V_X_VL)
+    return SDValue();
+
+  // TODO: Deal with values other than the neutral element.
+  auto IsRVVNeutralElement = [Opc, &DAG](SDNode *N, SDValue V) {
+    if (Opc == ISD::FADD && N->getFlags().hasNoSignedZeros()) {
+      auto C = dyn_cast<ConstantFPSDNode>(V.getNode());
+      return C && C->isZero();
+    }
+    return DAG.getNeutralElement(Opc, SDLoc(V), V.getSimpleValueType(),
+                                 N->getFlags()) == V;
+  };
+
+  // Check that the scalar operand of ScalarV is the neutral element.
+  if (!IsRVVNeutralElement(N, ScalarV.getOperand(1)))
+    return SDValue();
+
+  if (!ScalarV.hasOneUse())
+    return SDValue();
+
+  EVT SplatVT = ScalarV.getValueType();
+  SDValue NewStart = N->getOperand(1 - ReduceIdx);
+  unsigned SplatOpc = RISCVISD::VFMV_S_F_VL;
+  if (SplatVT.isInteger()) {
+    auto *C = dyn_cast<ConstantSDNode>(NewStart.getNode());
+    if (!C || C->isZero() || !isInt<5>(C->getSExtValue()))
+      SplatOpc = RISCVISD::VMV_S_X_VL;
+    else
+      SplatOpc = RISCVISD::VMV_V_X_VL;
+  }
+
+  SDValue NewScalarV =
+      DAG.getNode(SplatOpc, SDLoc(N), SplatVT, ScalarV.getOperand(0), NewStart,
+                  ScalarV.getOperand(2));
+  SDValue NewReduce =
+      DAG.getNode(Reduce.getOpcode(), SDLoc(Reduce), Reduce.getValueType(),
+                  Reduce.getOperand(0), Reduce.getOperand(1), NewScalarV,
+                  Reduce.getOperand(3), Reduce.getOperand(4));
+  return DAG.getNode(Extract.getOpcode(), SDLoc(Extract),
+                     Extract.getValueType(), NewReduce, Extract.getOperand(1));
+}
+
 // Match the following pattern as a GREVI(W) operation
 // (or (BITMANIP_SHL x), (BITMANIP_SRL x))
 static SDValue combineORToGREV(SDValue Op, SelectionDAG &DAG,
@@ -7657,6 +7768,8 @@
     return V;
   if (SDValue V = transformAddShlImm(N, DAG, Subtarget))
     return V;
+  if (SDValue V = combineBinOpToReduce(N, DAG))
+    return V;
   // fold (add (select lhs, rhs, cc, 0, y), x) ->
   //      (select lhs, rhs, cc, x, (add x, y))
   return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false);
@@ -7671,6 +7784,8 @@
 }
 
 static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG) {
+  if (SDValue V = combineBinOpToReduce(N, DAG))
+    return V;
   // fold (and (select lhs, rhs, cc, -1, y), x) ->
   //      (select lhs, rhs, cc, x, (and x, y))
   return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ true);
@@ -7687,17 +7802,49 @@
     return SHFL;
   }
 
+  if (SDValue V = combineBinOpToReduce(N, DAG))
+    return V;
   // fold (or (select cond, 0, y), x) ->
   //      (select cond, x, (or x, y))
   return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false);
 }
 
 static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG) {
+  if (SDValue V = combineBinOpToReduce(N, DAG))
+    return V;
   // fold (xor (select cond, 0, y), x) ->
   //      (select cond, x, (xor x, y))
   return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false);
 }
 
+static SDValue performFADDCombine(SDNode *N, SelectionDAG &DAG) {
+  return combineBinOpToReduce(N, DAG);
+}
+
+static SDValue performFMAXNUMCombine(SDNode *N, SelectionDAG &DAG) {
+  return combineBinOpToReduce(N, DAG);
+}
+
+static SDValue performFMINNUMCombine(SDNode *N, SelectionDAG &DAG) {
+  return combineBinOpToReduce(N, DAG);
+}
+
+static SDValue performUMAXCombine(SDNode *N, SelectionDAG &DAG) {
+  return combineBinOpToReduce(N, DAG);
+}
+
+static SDValue performUMINCombine(SDNode *N, SelectionDAG &DAG) {
+  return combineBinOpToReduce(N, DAG);
+}
+
+static SDValue performSMAXCombine(SDNode *N, SelectionDAG &DAG) {
+  return combineBinOpToReduce(N, DAG);
+}
+
+static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG) {
+  return combineBinOpToReduce(N, DAG);
+}
+
 static SDValue performSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
                                                const RISCVSubtarget &Subtarget) {
@@ -8303,6 +8450,20 @@
     return performORCombine(N, DAG, Subtarget);
   case ISD::XOR:
     return performXORCombine(N, DAG);
+
case ISD::FADD: + return performFADDCombine(N, DAG); + case ISD::UMAX: + return performUMAXCombine(N, DAG); + case ISD::UMIN: + return performUMINCombine(N, DAG); + case ISD::SMAX: + return performSMAXCombine(N, DAG); + case ISD::SMIN: + return performSMINCombine(N, DAG); + case ISD::FMAXNUM: + return performFMAXNUMCombine(N, DAG); + case ISD::FMINNUM: + return performFMINNUMCombine(N, DAG); case ISD::SIGN_EXTEND_INREG: return performSIGN_EXTEND_INREGCombine(N, DAG, Subtarget); case ISD::ZERO_EXTEND: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+experimental-zvfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+experimental-zvfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+experimental-zvfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+experimental-zvfh,+f,+d -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s declare half @llvm.vector.reduce.fadd.v1f16(half, <1 x half>) @@ -38,12 +38,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: fmv.h.x ft0, zero -; CHECK-NEXT: fneg.h ft0, ft0 -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.h fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x half>, <2 x half>* %x %red = call reassoc half @llvm.vector.reduce.fadd.v2f16(half %s, <2 x half> %v) @@ -71,12 +68,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: fmv.h.x ft0, zero -; CHECK-NEXT: fneg.h ft0, ft0 -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.h fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call reassoc half @llvm.vector.reduce.fadd.v4f16(half %s, <4 x half> %v) @@ -104,12 +98,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: fmv.h.x ft0, zero -; CHECK-NEXT: fneg.h ft0, ft0 -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.h fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x half>, <8 x half>* %x %red = call reassoc half @llvm.vector.reduce.fadd.v8f16(half %s, <8 x half> %v) @@ -137,12 +128,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: fmv.h.x ft0, zero -; CHECK-NEXT: fneg.h ft0, ft0 -; CHECK-NEXT: vfmv.s.f v10, ft0 +; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vfredusum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.h fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; 
CHECK-NEXT: ret %v = load <16 x half>, <16 x half>* %x %red = call reassoc half @llvm.vector.reduce.fadd.v16f16(half %s, <16 x half> %v) @@ -171,14 +159,11 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: fmv.h.x ft0, zero -; CHECK-NEXT: fneg.h ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v12, ft0 +; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vfredusum.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.h fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <32 x half>, <32 x half>* %x %red = call reassoc half @llvm.vector.reduce.fadd.v32f16(half %s, <32 x half> %v) @@ -210,14 +195,11 @@ ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: fmv.h.x ft0, zero -; CHECK-NEXT: fneg.h ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v16, ft0 +; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vfredusum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.h fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <64 x half>, <64 x half>* %x %red = call reassoc half @llvm.vector.reduce.fadd.v64f16(half %s, <64 x half> %v) @@ -252,14 +234,11 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle16.v v16, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v16 -; CHECK-NEXT: fmv.h.x ft0, zero -; CHECK-NEXT: fneg.h ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v16, ft0 +; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu ; CHECK-NEXT: vfredusum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.h fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <128 x half>, <128 x half>* %x %red = call reassoc half @llvm.vector.reduce.fadd.v128f16(half %s, <128 x half> %v) @@ -360,12 +339,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x float>, <2 x float>* %x %red = call reassoc float @llvm.vector.reduce.fadd.v2f32(float %s, <2 x float> %v) @@ -391,15 +367,12 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vfwredusum.vs v8, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x half>, <2 x half>* %x %e = fpext <2 x half> %v to <2 x float> @@ -432,12 +405,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s 
fa0, v8 ; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call reassoc float @llvm.vector.reduce.fadd.v4f32(float %s, <4 x float> %v) @@ -463,15 +433,12 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vfwredusum.vs v8, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %e = fpext <4 x half> %v to <4 x float> @@ -504,12 +471,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 -; CHECK-NEXT: vfmv.s.f v10, ft0 +; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vfredusum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x float>, <8 x float>* %x %red = call reassoc float @llvm.vector.reduce.fadd.v8f32(float %s, <8 x float> %v) @@ -535,15 +499,12 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vfwredusum.vs v8, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x half>, <8 x half>* %x %e = fpext <8 x half> %v to <8 x float> @@ -576,12 +537,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 -; CHECK-NEXT: vfmv.s.f v12, ft0 +; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vfredusum.vs v8, v8, v12 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x float>, <16 x float>* %x %red = call reassoc float @llvm.vector.reduce.fadd.v16f32(float %s, <16 x float> %v) @@ -607,15 +565,12 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v10, ft0 +; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vfwredusum.vs v8, v8, v10 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x half>, <16 x half>* %x %e = fpext <16 x half> %v to <16 x float> @@ -649,14 +604,11 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v16, ft0 +; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vfredusum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 
+; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <32 x float>, <32 x float>* %x %red = call reassoc float @llvm.vector.reduce.fadd.v32f32(float %s, <32 x float> %v) @@ -686,15 +638,12 @@ ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v12, ft0 +; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu ; CHECK-NEXT: vfwredusum.vs v8, v8, v12 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <32 x half>, <32 x half>* %x %e = fpext <32 x half> %v to <32 x float> @@ -732,14 +681,11 @@ ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vfadd.vv v8, v8, v16 -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v16, ft0 +; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu ; CHECK-NEXT: vfredusum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <64 x float>, <64 x float>* %x %red = call reassoc float @llvm.vector.reduce.fadd.v64f32(float %s, <64 x float> %v) @@ -784,14 +730,11 @@ ; CHECK-NEXT: vfwcvt.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, mu ; CHECK-NEXT: vfadd.vv v8, v16, v24 -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v16, ft0 +; CHECK-NEXT: vfmv.s.f v16, fa0 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu ; CHECK-NEXT: vfredusum.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <64 x half>, <64 x half>* %x %e = fpext <64 x half> %v to <64 x float> @@ -893,29 +836,14 @@ declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>) define double @vreduce_fadd_v2f64(<2 x double>* %x, double %s) { -; RV32-LABEL: vreduce_fadd_v2f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vfmv.s.f v9, ft0 -; RV32-NEXT: vfredusum.vs v8, v8, v9 -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fadd_v2f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vfmv.s.f v9, ft0 -; RV64-NEXT: vfredusum.vs v8, v8, v9 -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fadd_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %v = load <2 x double>, <2 x double>* %x %red = call reassoc double @llvm.vector.reduce.fadd.v2f64(double %s, <2 x double> %v) ret double %red @@ -936,35 +864,17 @@ } define double @vreduce_fwadd_v2f64(<2 x float>* %x, double %s) { -; RV32-LABEL: vreduce_fwadd_v2f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: fcvt.d.w ft0, 
zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vfmv.s.f v9, ft0 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: vfwredusum.vs v8, v8, v9 -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fwadd_v2f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vfmv.s.f v9, ft0 -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV64-NEXT: vfwredusum.vs v8, v8, v9 -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fwadd_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %v = load <2 x float>, <2 x float>* %x %e = fpext <2 x float> %v to <2 x double> %red = call reassoc double @llvm.vector.reduce.fadd.v2f64(double %s, <2 x double> %e) @@ -992,29 +902,14 @@ declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>) define double @vreduce_fadd_v4f64(<4 x double>* %x, double %s) { -; RV32-LABEL: vreduce_fadd_v4f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vfmv.s.f v10, ft0 -; RV32-NEXT: vfredusum.vs v8, v8, v10 -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fadd_v4f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vfmv.s.f v10, ft0 -; RV64-NEXT: vfredusum.vs v8, v8, v10 -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fadd_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call reassoc double @llvm.vector.reduce.fadd.v4f64(double %s, <4 x double> %v) ret double %red @@ -1035,35 +930,17 @@ } define double @vreduce_fwadd_v4f64(<4 x float>* %x, double %s) { -; RV32-LABEL: vreduce_fwadd_v4f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vfmv.s.f v9, ft0 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV32-NEXT: vfwredusum.vs v8, v8, v9 -; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fwadd_v4f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vfmv.s.f v9, ft0 -; RV64-NEXT: 
vsetivli zero, 4, e32, m1, ta, mu -; RV64-NEXT: vfwredusum.vs v8, v8, v9 -; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fwadd_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %e = fpext <4 x float> %v to <4 x double> %red = call reassoc double @llvm.vector.reduce.fadd.v4f64(double %s, <4 x double> %e) @@ -1091,29 +968,14 @@ declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>) define double @vreduce_fadd_v8f64(<8 x double>* %x, double %s) { -; RV32-LABEL: vreduce_fadd_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vfmv.s.f v12, ft0 -; RV32-NEXT: vfredusum.vs v8, v8, v12 -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fadd_v8f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vfmv.s.f v12, ft0 -; RV64-NEXT: vfredusum.vs v8, v8, v12 -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fadd_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfmv.s.f v12, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %v = load <8 x double>, <8 x double>* %x %red = call reassoc double @llvm.vector.reduce.fadd.v8f64(double %s, <8 x double> %v) ret double %red @@ -1134,35 +996,17 @@ } define double @vreduce_fwadd_v8f64(<8 x float>* %x, double %s) { -; RV32-LABEL: vreduce_fwadd_v8f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vfmv.s.f v10, ft0 -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV32-NEXT: vfwredusum.vs v8, v8, v10 -; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fwadd_v8f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vfmv.s.f v10, ft0 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64-NEXT: vfwredusum.vs v8, v8, v10 -; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fwadd_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vfwredusum.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %v = load <8 x 
float>, <8 x float>* %x %e = fpext <8 x float> %v to <8 x double> %red = call reassoc double @llvm.vector.reduce.fadd.v8f64(double %s, <8 x double> %e) @@ -1190,29 +1034,14 @@ declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>) define double @vreduce_fadd_v16f64(<16 x double>* %x, double %s) { -; RV32-LABEL: vreduce_fadd_v16f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vfmv.s.f v16, ft0 -; RV32-NEXT: vfredusum.vs v8, v8, v16 -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fadd_v16f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vfmv.s.f v16, ft0 -; RV64-NEXT: vfredusum.vs v8, v8, v16 -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fadd_v16f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %v = load <16 x double>, <16 x double>* %x %red = call reassoc double @llvm.vector.reduce.fadd.v16f64(double %s, <16 x double> %v) ret double %red @@ -1233,35 +1062,17 @@ } define double @vreduce_fwadd_v16f64(<16 x float>* %x, double %s) { -; RV32-LABEL: vreduce_fwadd_v16f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vfmv.s.f v12, ft0 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vfwredusum.vs v8, v8, v12 -; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fwadd_v16f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vfmv.s.f v12, ft0 -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV64-NEXT: vfwredusum.vs v8, v8, v12 -; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fwadd_v16f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vfmv.s.f v12, fa0 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vfwredusum.vs v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %v = load <16 x float>, <16 x float>* %x %e = fpext <16 x float> %v to <16 x double> %red = call reassoc double @llvm.vector.reduce.fadd.v16f64(double %s, <16 x double> %e) @@ -1289,35 +1100,17 @@ declare double @llvm.vector.reduce.fadd.v32f64(double, <32 x double>) define double @vreduce_fadd_v32f64(<32 x double>* %x, double %s) { -; RV32-LABEL: vreduce_fadd_v32f64: -; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: addi a0, a0, 128 -; RV32-NEXT: vle64.v v16, (a0) -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vfmv.s.f v24, 
ft0 -; RV32-NEXT: vfadd.vv v8, v8, v16 -; RV32-NEXT: vfredusum.vs v8, v8, v24 -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fadd_v32f64: -; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu -; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: addi a0, a0, 128 -; RV64-NEXT: vle64.v v16, (a0) -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vfmv.s.f v24, ft0 -; RV64-NEXT: vfadd.vv v8, v8, v16 -; RV64-NEXT: vfredusum.vs v8, v8, v24 -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fadd_v32f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 128 +; CHECK-NEXT: vle64.v v16, (a0) +; CHECK-NEXT: vfmv.s.f v24, fa0 +; CHECK-NEXT: vfadd.vv v8, v8, v16 +; CHECK-NEXT: vfredusum.vs v8, v8, v24 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %v = load <32 x double>, <32 x double>* %x %red = call reassoc double @llvm.vector.reduce.fadd.v32f64(double %s, <32 x double> %v) ret double %red @@ -1343,45 +1136,22 @@ } define double @vreduce_fwadd_v32f64(<32 x float>* %x, double %s) { -; RV32-LABEL: vreduce_fwadd_v32f64: -; RV32: # %bb.0: -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV32-NEXT: vslidedown.vi v16, v8, 16 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vfwcvt.f.f.v v24, v16 -; RV32-NEXT: vfwcvt.f.f.v v16, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV32-NEXT: vfadd.vv v8, v16, v24 -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vfmv.s.f v16, ft0 -; RV32-NEXT: vfredusum.vs v8, v8, v16 -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fwadd_v32f64: -; RV64: # %bb.0: -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, mu -; RV64-NEXT: vslidedown.vi v16, v8, 16 -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV64-NEXT: vfwcvt.f.f.v v24, v16 -; RV64-NEXT: vfwcvt.f.f.v v16, v8 -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vfadd.vv v8, v16, v24 -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vfmv.s.f v16, ft0 -; RV64-NEXT: vfredusum.vs v8, v8, v16 -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fwadd_v32f64: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; CHECK-NEXT: vfwcvt.f.f.v v24, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; CHECK-NEXT: vfadd.vv v8, v16, v24 +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v16 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %v = load <32 x float>, <32 x float>* %x %e = fpext <32 x float> %v to <32 x double> %red = call reassoc double @llvm.vector.reduce.fadd.v32f64(double %s, <32 x double> %e) @@ -2030,10 +1800,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; 
CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float %s, <4 x float> %v) diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll @@ -0,0 +1,306 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+experimental-zvfh,+v,+zbb -riscv-v-vector-bits-min=128 -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s + +define i64 @reduce_add(i64 %x, <4 x i64> %v) { +; CHECK-LABEL: reduce_add: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredsum.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v) + %res = add i64 %rdx, %x + ret i64 %res +} + +define i64 @reduce_add2(<4 x i64> %v) { +; CHECK-LABEL: reduce_add2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 8 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredsum.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v) + %res = add i64 %rdx, 8 + ret i64 %res +} + +define i64 @reduce_and(i64 %x, <4 x i64> %v) { +; CHECK-LABEL: reduce_and: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredand.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v) + %res = and i64 %rdx, %x + ret i64 %res +} + +define i64 @reduce_and2(<4 x i64> %v) { +; CHECK-LABEL: reduce_and2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 8 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredand.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v) + %res = and i64 %rdx, 8 + ret i64 %res +} + +define i64 @reduce_or(i64 %x, <4 x i64> %v) { +; CHECK-LABEL: reduce_or: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredor.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v) + %res = or i64 %rdx, %x + ret i64 %res +} + +define i64 @reduce_or2(<4 x i64> %v) { +; CHECK-LABEL: reduce_or2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 8 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredor.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v) + %res = or i64 %rdx, 8 + ret i64 %res +} + +define i64 @reduce_xor(i64 %x, <4 x i64> %v) { +; CHECK-LABEL: reduce_xor: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredxor.vs v8, v8, 
v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v) + %res = xor i64 %rdx, %x + ret i64 %res +} + +define i64 @reduce_xor2(<4 x i64> %v) { +; CHECK-LABEL: reduce_xor2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredxor.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: andi a0, a0, 8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v) + %res = and i64 %rdx, 8 + ret i64 %res +} + +define i64 @reduce_umax(i64 %x, <4 x i64> %v) { +; CHECK-LABEL: reduce_umax: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredmaxu.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v) + %res = call i64 @llvm.umax.i64(i64 %rdx, i64 %x) + ret i64 %res +} + +define i64 @reduce_umax2(<4 x i64> %v) { +; CHECK-LABEL: reduce_umax2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 8 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredmaxu.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v) + %res = call i64 @llvm.umax.i64(i64 %rdx, i64 8) + ret i64 %res +} + +define i64 @reduce_umin(i64 %x, <4 x i64> %v) { +; CHECK-LABEL: reduce_umin: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredminu.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v) + %res = call i64 @llvm.umin.i64(i64 %rdx, i64 %x) + ret i64 %res +} + +define i64 @reduce_umin2(<4 x i64> %v) { +; CHECK-LABEL: reduce_umin2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 8 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredminu.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v) + %res = call i64 @llvm.umin.i64(i64 %rdx, i64 8) + ret i64 %res +} + +define i64 @reduce_smax(i64 %x, <4 x i64> %v) { +; CHECK-LABEL: reduce_smax: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredmax.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v) + %res = call i64 @llvm.smax.i64(i64 %rdx, i64 %x) + ret i64 %res +} + +define i64 @reduce_smax2(<4 x i64> %v) { +; CHECK-LABEL: reduce_smax2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 8 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredmax.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v) + %res = call i64 @llvm.smax.i64(i64 %rdx, i64 8) + ret i64 %res +} + +define i64 @reduce_smin(i64 %x, <4 x i64> %v) { +; CHECK-LABEL: reduce_smin: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; 
CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredmin.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v) + %res = call i64 @llvm.smin.i64(i64 %rdx, i64 %x) + ret i64 %res +} + +define i64 @reduce_smin2(<4 x i64> %v) { +; CHECK-LABEL: reduce_smin2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vmv.v.i v10, 8 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vredmin.vs v8, v8, v10 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v) + %res = call i64 @llvm.smin.i64(i64 %rdx, i64 8) + ret i64 %res +} + +define float @reduce_fadd(float %x, <4 x float> %v) { +; CHECK-LABEL: reduce_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfredusum.vs v8, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call fast float @llvm.vector.reduce.fadd.v4f32(float %x, <4 x float> %v) + ret float %rdx +} + +define float @reduce_fmax(float %x, <4 x float> %v) { +; CHECK-LABEL: reduce_fmax: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfredmax.vs v8, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v) + %res = call float @llvm.maxnum.f32(float %x, float %rdx) + ret float %res +} + +define float @reduce_fmin(float %x, <4 x float> %v) { +; CHECK-LABEL: reduce_fmin: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfredmin.vs v8, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret +entry: + %rdx = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v) + %res = call float @llvm.minnum.f32(float %x, float %rdx) + ret float %res +} + +; Function Attrs: nofree nosync nounwind readnone willreturn +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) +declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare i64 @llvm.umax.i64(i64, i64) +declare i64 @llvm.umin.i64(i64, i64) +declare i64 @llvm.smax.i64(i64, i64) +declare i64 @llvm.smin.i64(i64, i64) +declare float @llvm.maxnum.f32(float ,float) +declare float @llvm.minnum.f32(float ,float) diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll @@ -1,22 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+experimental-zvfh,+v -target-abi=ilp32d \ -; RUN: 
-verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+experimental-zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: -verify-machineinstrs < %s | FileCheck %s declare half @llvm.vector.reduce.fadd.nxv1f16(half, ) define half @vreduce_fadd_nxv1f16( %v, half %s) { ; CHECK-LABEL: vreduce_fadd_nxv1f16: ; CHECK: # %bb.0: -; CHECK-NEXT: fmv.h.x ft0, zero -; CHECK-NEXT: fneg.h ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.h fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %red = call reassoc half @llvm.vector.reduce.fadd.nxv1f16(half %s, %v) ret half %red @@ -40,14 +37,11 @@ define half @vreduce_fadd_nxv2f16( %v, half %s) { ; CHECK-LABEL: vreduce_fadd_nxv2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: fmv.h.x ft0, zero -; CHECK-NEXT: fneg.h ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.h fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %red = call reassoc half @llvm.vector.reduce.fadd.nxv2f16(half %s, %v) ret half %red @@ -71,14 +65,11 @@ define half @vreduce_fadd_nxv4f16( %v, half %s) { ; CHECK-LABEL: vreduce_fadd_nxv4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: fmv.h.x ft0, zero -; CHECK-NEXT: fneg.h ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.h fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %red = call reassoc half @llvm.vector.reduce.fadd.nxv4f16(half %s, %v) ret half %red @@ -102,14 +93,11 @@ define float @vreduce_fadd_nxv1f32( %v, float %s) { ; CHECK-LABEL: vreduce_fadd_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %red = call reassoc float @llvm.vector.reduce.fadd.nxv1f32(float %s, %v) ret float %red @@ -131,15 +119,12 @@ define float @vreduce_fwadd_nxv1f32( %v, float %s) { ; CHECK-LABEL: vreduce_fwadd_nxv1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vfwredusum.vs v8, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %e = fpext %v to %red = call reassoc float @llvm.vector.reduce.fadd.nxv1f32(float %s, %e) @@ -166,14 +151,11 @@ define float @vreduce_fadd_nxv2f32( %v, float %s) { ; CHECK-LABEL: vreduce_fadd_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: fmv.w.x ft0, zero -; 
CHECK-NEXT: fneg.s ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu ; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %red = call reassoc float @llvm.vector.reduce.fadd.nxv2f32(float %s, %v) ret float %red @@ -195,15 +177,12 @@ define float @vreduce_fwadd_nxv2f32( %v, float %s) { ; CHECK-LABEL: vreduce_fwadd_nxv2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vfwredusum.vs v8, v8, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %e = fpext %v to %red = call reassoc float @llvm.vector.reduce.fadd.nxv2f32(float %s, %e) @@ -230,14 +209,11 @@ define float @vreduce_fadd_nxv4f32( %v, float %s) { ; CHECK-LABEL: vreduce_fadd_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v10, ft0 +; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu ; CHECK-NEXT: vfredusum.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %red = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float %s, %v) ret float %red @@ -259,15 +235,12 @@ define float @vreduce_fwadd_nxv4f32( %v, float %s) { ; CHECK-LABEL: vreduce_fwadd_nxv4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: fneg.s ft0, ft0 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vfmv.s.f v9, ft0 +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vfwredusum.vs v8, v8, v9 ; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, mu -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %e = fpext %v to %red = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float %s, %e) @@ -292,29 +265,14 @@ declare double @llvm.vector.reduce.fadd.nxv1f64(double, ) define double @vreduce_fadd_nxv1f64( %v, double %s) { -; RV32-LABEL: vreduce_fadd_nxv1f64: -; RV32: # %bb.0: -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vfmv.s.f v9, ft0 -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV32-NEXT: vfredusum.vs v8, v8, v9 -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fadd_nxv1f64: -; RV64: # %bb.0: -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vfmv.s.f v9, ft0 -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV64-NEXT: vfredusum.vs v8, v8, v9 -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fadd_nxv1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; CHECK-NEXT: vfredusum.vs v8, v8, v9 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %red = call reassoc double @llvm.vector.reduce.fadd.nxv1f64(double %s, 
%v) ret double %red } @@ -333,31 +291,15 @@ } define double @vreduce_fwadd_nxv1f64( %v, double %s) { -; RV32-LABEL: vreduce_fwadd_nxv1f64: -; RV32: # %bb.0: -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vfmv.s.f v9, ft0 -; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; RV32-NEXT: vfwredusum.vs v8, v8, v9 -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fwadd_nxv1f64: -; RV64: # %bb.0: -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vfmv.s.f v9, ft0 -; RV64-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; RV64-NEXT: vfwredusum.vs v8, v8, v9 -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fwadd_nxv1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %e = fpext %v to %red = call reassoc double @llvm.vector.reduce.fadd.nxv1f64(double %s, %e) ret double %red @@ -381,29 +323,14 @@ declare double @llvm.vector.reduce.fadd.nxv2f64(double, ) define double @vreduce_fadd_nxv2f64( %v, double %s) { -; RV32-LABEL: vreduce_fadd_nxv2f64: -; RV32: # %bb.0: -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vfmv.s.f v10, ft0 -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV32-NEXT: vfredusum.vs v8, v8, v10 -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fadd_nxv2f64: -; RV64: # %bb.0: -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vfmv.s.f v10, ft0 -; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV64-NEXT: vfredusum.vs v8, v8, v10 -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fadd_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; CHECK-NEXT: vfredusum.vs v8, v8, v10 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %red = call reassoc double @llvm.vector.reduce.fadd.nxv2f64(double %s, %v) ret double %red } @@ -422,31 +349,15 @@ } define double @vreduce_fwadd_nxv2f64( %v, double %s) { -; RV32-LABEL: vreduce_fwadd_nxv2f64: -; RV32: # %bb.0: -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vfmv.s.f v9, ft0 -; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; RV32-NEXT: vfwredusum.vs v8, v8, v9 -; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fwadd_nxv2f64: -; RV64: # %bb.0: -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vfmv.s.f v9, ft0 -; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; RV64-NEXT: vfwredusum.vs v8, v8, v9 -; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; 
CHECK-LABEL: vreduce_fwadd_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %e = fpext %v to %red = call reassoc double @llvm.vector.reduce.fadd.nxv2f64(double %s, %e) ret double %red @@ -470,29 +381,14 @@ declare double @llvm.vector.reduce.fadd.nxv4f64(double, ) define double @vreduce_fadd_nxv4f64( %v, double %s) { -; RV32-LABEL: vreduce_fadd_nxv4f64: -; RV32: # %bb.0: -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vfmv.s.f v12, ft0 -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV32-NEXT: vfredusum.vs v8, v8, v12 -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fadd_nxv4f64: -; RV64: # %bb.0: -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vfmv.s.f v12, ft0 -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV64-NEXT: vfredusum.vs v8, v8, v12 -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fadd_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vfmv.s.f v12, fa0 +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; CHECK-NEXT: vfredusum.vs v8, v8, v12 +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %red = call reassoc double @llvm.vector.reduce.fadd.nxv4f64(double %s, %v) ret double %red } @@ -511,31 +407,15 @@ } define double @vreduce_fwadd_nxv4f64( %v, double %s) { -; RV32-LABEL: vreduce_fwadd_nxv4f64: -; RV32: # %bb.0: -; RV32-NEXT: fcvt.d.w ft0, zero -; RV32-NEXT: fneg.d ft0, ft0 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV32-NEXT: vfmv.s.f v10, ft0 -; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; RV32-NEXT: vfwredusum.vs v8, v8, v10 -; RV32-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV32-NEXT: vfmv.f.s ft0, v8 -; RV32-NEXT: fadd.d fa0, fa0, ft0 -; RV32-NEXT: ret -; -; RV64-LABEL: vreduce_fwadd_nxv4f64: -; RV64: # %bb.0: -; RV64-NEXT: fmv.d.x ft0, zero -; RV64-NEXT: fneg.d ft0, ft0 -; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; RV64-NEXT: vfmv.s.f v10, ft0 -; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; RV64-NEXT: vfwredusum.vs v8, v8, v10 -; RV64-NEXT: vsetivli zero, 0, e64, m1, ta, mu -; RV64-NEXT: vfmv.f.s ft0, v8 -; RV64-NEXT: fadd.d fa0, fa0, ft0 -; RV64-NEXT: ret +; CHECK-LABEL: vreduce_fwadd_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; CHECK-NEXT: vfwredusum.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret %e = fpext %v to %red = call reassoc double @llvm.vector.reduce.fadd.nxv4f64(double %s, %e) ret double %red @@ -1160,11 +1040,10 @@ ; CHECK-LABEL: vreduce_nsz_fadd_nxv1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, mu -; CHECK-NEXT: vmv.s.x v9, zero +; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, mu ; CHECK-NEXT: vfredusum.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s ft0, v8 -; CHECK-NEXT: fadd.s fa0, fa0, ft0 +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %red = call reassoc nsz float @llvm.vector.reduce.fadd.nxv1f32(float %s, %v) ret float %red
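For reference, a minimal standalone sketch of the pattern combineBinOpToReduce is after; it mirrors the reduce_add case in the new fold-binary-reduce.ll test above, and the function name fold_binop_into_reduce is illustrative only, not part of the patch.

; A scalar binary op fed by the extract of a matching vector reduction.
; With this combine the scalar operand %x is folded into the reduction's
; start value, so the sequence lowers to vmv.s.x + vredsum.vs + vmv.x.s
; rather than a separate scalar add after reading the reduction result.
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)

define i64 @fold_binop_into_reduce(i64 %x, <4 x i64> %v) {
  %rdx = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
  %res = add i64 %rdx, %x
  ret i64 %res
}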