diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1182,6 +1182,21 @@ break; case ISD::VECREDUCE_SEQ_FADD: case ISD::VECREDUCE_SEQ_FMUL: + case ISD::VP_REDUCE_FADD: + case ISD::VP_REDUCE_FMUL: + case ISD::VP_REDUCE_ADD: + case ISD::VP_REDUCE_MUL: + case ISD::VP_REDUCE_AND: + case ISD::VP_REDUCE_OR: + case ISD::VP_REDUCE_XOR: + case ISD::VP_REDUCE_SMAX: + case ISD::VP_REDUCE_SMIN: + case ISD::VP_REDUCE_UMAX: + case ISD::VP_REDUCE_UMIN: + case ISD::VP_REDUCE_FMAX: + case ISD::VP_REDUCE_FMIN: + case ISD::VP_REDUCE_SEQ_FADD: + case ISD::VP_REDUCE_SEQ_FMUL: Action = TLI.getOperationAction( Node->getOpcode(), Node->getOperand(1).getValueType()); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -222,6 +222,18 @@ Res = PromoteIntRes_VECREDUCE(N); break; + case ISD::VP_REDUCE_ADD: + case ISD::VP_REDUCE_MUL: + case ISD::VP_REDUCE_AND: + case ISD::VP_REDUCE_OR: + case ISD::VP_REDUCE_XOR: + case ISD::VP_REDUCE_SMAX: + case ISD::VP_REDUCE_SMIN: + case ISD::VP_REDUCE_UMAX: + case ISD::VP_REDUCE_UMIN: + Res = PromoteIntRes_VP_REDUCE(N); + break; + case ISD::FREEZE: Res = PromoteIntRes_FREEZE(N); break; @@ -1570,6 +1582,15 @@ case ISD::VECREDUCE_SMIN: case ISD::VECREDUCE_UMAX: case ISD::VECREDUCE_UMIN: Res = PromoteIntOp_VECREDUCE(N); break; + case ISD::VP_REDUCE_ADD: + case ISD::VP_REDUCE_MUL: + case ISD::VP_REDUCE_AND: + case ISD::VP_REDUCE_OR: + case ISD::VP_REDUCE_XOR: + case ISD::VP_REDUCE_SMAX: + case ISD::VP_REDUCE_SMIN: + case ISD::VP_REDUCE_UMAX: + case ISD::VP_REDUCE_UMIN: Res = PromoteIntOp_VP_REDUCE(N, OpNo); break; case ISD::SET_ROUNDING: Res = PromoteIntOp_SET_ROUNDING(N); break; } @@ -2029,30 +2050,54 
@@ return SDValue(); } -SDValue DAGTypeLegalizer::PromoteIntOp_VECREDUCE(SDNode *N) { - SDLoc dl(N); - SDValue Op; +static unsigned getExtendForIntVecReduction(SDNode *N) { switch (N->getOpcode()) { - default: llvm_unreachable("Expected integer vector reduction"); + default: + llvm_unreachable("Expected integer vector reduction"); case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_MUL: case ISD::VECREDUCE_AND: case ISD::VECREDUCE_OR: case ISD::VECREDUCE_XOR: - Op = GetPromotedInteger(N->getOperand(0)); - break; + case ISD::VP_REDUCE_ADD: + case ISD::VP_REDUCE_MUL: + case ISD::VP_REDUCE_AND: + case ISD::VP_REDUCE_OR: + case ISD::VP_REDUCE_XOR: + return ISD::ANY_EXTEND; case ISD::VECREDUCE_SMAX: case ISD::VECREDUCE_SMIN: - Op = SExtPromotedInteger(N->getOperand(0)); - break; + case ISD::VP_REDUCE_SMAX: + case ISD::VP_REDUCE_SMIN: + return ISD::SIGN_EXTEND; case ISD::VECREDUCE_UMAX: case ISD::VECREDUCE_UMIN: - Op = ZExtPromotedInteger(N->getOperand(0)); - break; + case ISD::VP_REDUCE_UMAX: + case ISD::VP_REDUCE_UMIN: + return ISD::ZERO_EXTEND; + } +} + +SDValue DAGTypeLegalizer::PromoteIntOpVectorReduction(SDNode *N, SDValue V) { + switch (getExtendForIntVecReduction(N)) { + default: + llvm_unreachable("Impossible extension kind for integer reduction"); + case ISD::ANY_EXTEND: + return GetPromotedInteger(V); + case ISD::SIGN_EXTEND: + return SExtPromotedInteger(V); + case ISD::ZERO_EXTEND: + return ZExtPromotedInteger(V); } +} + +SDValue DAGTypeLegalizer::PromoteIntOp_VECREDUCE(SDNode *N) { + SDLoc dl(N); + SDValue Op = PromoteIntOpVectorReduction(N, N->getOperand(0)); EVT EltVT = Op.getValueType().getVectorElementType(); EVT VT = N->getValueType(0); + if (VT.bitsGE(EltVT)) return DAG.getNode(N->getOpcode(), SDLoc(N), VT, Op); @@ -2062,6 +2107,38 @@ return DAG.getNode(ISD::TRUNCATE, dl, VT, Reduce); } +SDValue DAGTypeLegalizer::PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo) { + SDLoc DL(N); + SDValue Op = N->getOperand(OpNo); + SmallVector<SDValue, 4> NewOps(N->op_begin(), 
N->op_end()); + + if (OpNo == 2) { // Mask + // Update in place. + NewOps[2] = PromoteTargetBoolean(Op, N->getOperand(1).getValueType()); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); + } + + assert(OpNo == 1 && "Unexpected operand for promotion"); + + Op = PromoteIntOpVectorReduction(N, Op); + + NewOps[OpNo] = Op; + + EVT VT = N->getValueType(0); + EVT EltVT = Op.getValueType().getScalarType(); + + if (VT.bitsGE(EltVT)) + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, NewOps); + + // Result size must be >= element/start-value size. If this is not the case + // after promotion, also promote both the start value and result type and + // then truncate. + NewOps[0] = + DAG.getNode(getExtendForIntVecReduction(N), DL, EltVT, N->getOperand(0)); + SDValue Reduce = DAG.getNode(N->getOpcode(), DL, EltVT, NewOps); + return DAG.getNode(ISD::TRUNCATE, DL, VT, Reduce); +} + SDValue DAGTypeLegalizer::PromoteIntOp_SET_ROUNDING(SDNode *N) { SDValue Op = ZExtPromotedInteger(N->getOperand(1)); return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op), 0); @@ -4981,7 +5058,17 @@ // we can simply change the result type. SDLoc dl(N); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); + return DAG.getNode(N->getOpcode(), dl, NVT, N->ops()); +} + +SDValue DAGTypeLegalizer::PromoteIntRes_VP_REDUCE(SDNode *N) { + // The VP_REDUCE result size may be larger than the element size, so we can + // simply change the result type. However the start value and result must be + // the same. 
+ SDLoc DL(N); + SDValue Start = PromoteIntOpVectorReduction(N, N->getOperand(0)); + return DAG.getNode(N->getOpcode(), DL, Start.getValueType(), Start, + N->getOperand(1), N->getOperand(2), N->getOperand(3)); } SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -289,6 +289,12 @@ return DAG.getZeroExtendInReg(Op, DL, OldVT); } + // Promote the given operand V (vector or scalar) according to N's specific + // reduction kind. N must be an integer VECREDUCE_* or VP_REDUCE_*. Returns + // the promoted value, any-, sign- or zero-extended as the reduction kind + // requires. + SDValue PromoteIntOpVectorReduction(SDNode *N, SDValue V); + // Integer Result Promotion. void PromoteIntegerResult(SDNode *N, unsigned ResNo); SDValue PromoteIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo); @@ -354,6 +360,7 @@ SDValue PromoteIntRes_FLT_ROUNDS(SDNode *N); SDValue PromoteIntRes_ISNAN(SDNode *N); SDValue PromoteIntRes_VECREDUCE(SDNode *N); + SDValue PromoteIntRes_VP_REDUCE(SDNode *N); SDValue PromoteIntRes_ABS(SDNode *N); SDValue PromoteIntRes_Rotate(SDNode *N); SDValue PromoteIntRes_FunnelShift(SDNode *N); @@ -395,6 +402,7 @@ SDValue PromoteIntOp_FIX(SDNode *N); SDValue PromoteIntOp_FPOWI(SDNode *N); SDValue PromoteIntOp_VECREDUCE(SDNode *N); + SDValue PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_SET_ROUNDING(SDNode *N); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -164,12 +164,13 @@ VFNCVT_ROD_VL, // These nodes match the semantics of the corresponding RVV vector reduction // instructions. 
They produce a vector result which is the reduction - // performed over the first vector operand plus the first element of the - // second vector operand. The first operand is an unconstrained vector type, - // and the result and second operand's types are expected to be the - // corresponding full-width LMUL=1 type for the first operand: - // nxv8i8 = vecreduce_add nxv32i8, nxv8i8 - // nxv2i32 = vecreduce_add nxv8i32, nxv2i32 + // performed over the second vector operand plus the first element of the + // third vector operand. The first operand is the pass-thru operand. The + // second operand is an unconstrained vector type, and the result, first, and + // third operand's types are expected to be the corresponding full-width + // LMUL=1 type for the second operand: + // nxv8i8 = vecreduce_add nxv8i8, nxv32i8, nxv8i8 + // nxv2i32 = vecreduce_add nxv2i32, nxv8i32, nxv2i32 // The different in types does introduce extra vsetvli instructions but // similarly it reduces the number of registers consumed per reduction. // Also has a mask and VL operand. 
@@ -553,8 +554,10 @@ SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVPREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerVectorMaskVECREDUCE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVectorMaskVecReduction(SDValue Op, SelectionDAG &DAG, + bool IsVP) const; SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -431,12 +431,18 @@ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); static unsigned IntegerVPOps[] = { - ISD::VP_ADD, ISD::VP_SUB, ISD::VP_MUL, ISD::VP_SDIV, ISD::VP_UDIV, - ISD::VP_SREM, ISD::VP_UREM, ISD::VP_AND, ISD::VP_OR, ISD::VP_XOR, - ISD::VP_ASHR, ISD::VP_LSHR, ISD::VP_SHL}; - - static unsigned FloatingPointVPOps[] = {ISD::VP_FADD, ISD::VP_FSUB, - ISD::VP_FMUL, ISD::VP_FDIV}; + ISD::VP_ADD, ISD::VP_SUB, ISD::VP_MUL, + ISD::VP_SDIV, ISD::VP_UDIV, ISD::VP_SREM, + ISD::VP_UREM, ISD::VP_AND, ISD::VP_OR, + ISD::VP_XOR, ISD::VP_ASHR, ISD::VP_LSHR, + ISD::VP_SHL, ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND, + ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR, ISD::VP_REDUCE_SMAX, + ISD::VP_REDUCE_SMIN, ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN}; + + static unsigned FloatingPointVPOps[] = { + ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL, + ISD::VP_FDIV, ISD::VP_REDUCE_FADD, ISD::VP_REDUCE_SEQ_FADD, + ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX}; if (!Subtarget.is64Bit()) { // We must custom-lower certain vXi64 operations on RV32 due to the vector @@ -452,6 +458,15 @@ 
setOperationAction(ISD::VECREDUCE_SMIN, MVT::i64, Custom); setOperationAction(ISD::VECREDUCE_UMAX, MVT::i64, Custom); setOperationAction(ISD::VECREDUCE_UMIN, MVT::i64, Custom); + + setOperationAction(ISD::VP_REDUCE_ADD, MVT::i64, Custom); + setOperationAction(ISD::VP_REDUCE_AND, MVT::i64, Custom); + setOperationAction(ISD::VP_REDUCE_OR, MVT::i64, Custom); + setOperationAction(ISD::VP_REDUCE_XOR, MVT::i64, Custom); + setOperationAction(ISD::VP_REDUCE_SMAX, MVT::i64, Custom); + setOperationAction(ISD::VP_REDUCE_SMIN, MVT::i64, Custom); + setOperationAction(ISD::VP_REDUCE_UMAX, MVT::i64, Custom); + setOperationAction(ISD::VP_REDUCE_UMIN, MVT::i64, Custom); } for (MVT VT : BoolVecVTs) { @@ -474,6 +489,10 @@ setOperationAction(ISD::VECREDUCE_OR, VT, Custom); setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); + setOperationAction(ISD::VP_REDUCE_AND, VT, Custom); + setOperationAction(ISD::VP_REDUCE_OR, VT, Custom); + setOperationAction(ISD::VP_REDUCE_XOR, VT, Custom); + // RVV has native int->float & float->int conversions where the // element type sizes are within one power-of-two of each other. 
Any // wider distances between type sizes have to be lowered as sequences @@ -608,6 +627,7 @@ setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); + setOperationAction(ISD::FCOPYSIGN, VT, Legal); setOperationAction(ISD::LOAD, VT, Custom); @@ -698,6 +718,10 @@ setOperationAction(ISD::VECREDUCE_OR, VT, Custom); setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); + setOperationAction(ISD::VP_REDUCE_AND, VT, Custom); + setOperationAction(ISD::VP_REDUCE_OR, VT, Custom); + setOperationAction(ISD::VP_REDUCE_XOR, VT, Custom); + setOperationAction(ISD::SINT_TO_FP, VT, Custom); setOperationAction(ISD::UINT_TO_FP, VT, Custom); setOperationAction(ISD::FP_TO_SINT, VT, Custom); @@ -2568,13 +2592,29 @@ case ISD::VECREDUCE_OR: case ISD::VECREDUCE_XOR: if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i1) - return lowerVectorMaskVECREDUCE(Op, DAG); + return lowerVectorMaskVecReduction(Op, DAG, /*IsVP*/ false); return lowerVECREDUCE(Op, DAG); case ISD::VECREDUCE_FADD: case ISD::VECREDUCE_SEQ_FADD: case ISD::VECREDUCE_FMIN: case ISD::VECREDUCE_FMAX: return lowerFPVECREDUCE(Op, DAG); + case ISD::VP_REDUCE_ADD: + case ISD::VP_REDUCE_UMAX: + case ISD::VP_REDUCE_SMAX: + case ISD::VP_REDUCE_UMIN: + case ISD::VP_REDUCE_SMIN: + case ISD::VP_REDUCE_FADD: + case ISD::VP_REDUCE_SEQ_FADD: + case ISD::VP_REDUCE_FMIN: + case ISD::VP_REDUCE_FMAX: + return lowerVPREDUCE(Op, DAG); + case ISD::VP_REDUCE_AND: + case ISD::VP_REDUCE_OR: + case ISD::VP_REDUCE_XOR: + if (Op.getOperand(1).getValueType().getVectorElementType() == MVT::i1) + return lowerVectorMaskVecReduction(Op, DAG, /*IsVP*/ true); + return lowerVPREDUCE(Op, DAG); case ISD::INSERT_SUBVECTOR: return lowerINSERT_SUBVECTOR(Op, DAG); case ISD::EXTRACT_SUBVECTOR: @@ -3823,14 +3863,18 @@ } } -SDValue RISCVTargetLowering::lowerVectorMaskVECREDUCE(SDValue Op, - SelectionDAG &DAG) const { +SDValue 
RISCVTargetLowering::lowerVectorMaskVecReduction(SDValue Op, + SelectionDAG &DAG, + bool IsVP) const { SDLoc DL(Op); - SDValue Vec = Op.getOperand(0); + SDValue Vec = Op.getOperand(IsVP ? 1 : 0); MVT VecVT = Vec.getSimpleValueType(); assert((Op.getOpcode() == ISD::VECREDUCE_AND || Op.getOpcode() == ISD::VECREDUCE_OR || - Op.getOpcode() == ISD::VECREDUCE_XOR) && + Op.getOpcode() == ISD::VECREDUCE_XOR || + Op.getOpcode() == ISD::VP_REDUCE_AND || + Op.getOpcode() == ISD::VP_REDUCE_OR || + Op.getOpcode() == ISD::VP_REDUCE_XOR) && "Unexpected reduction lowering"); MVT XLenVT = Subtarget.getXLenVT(); @@ -3844,29 +3888,62 @@ } SDValue Mask, VL; - std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); + if (IsVP) { + Mask = Op.getOperand(2); + VL = Op.getOperand(3); + } else { + std::tie(Mask, VL) = + getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); + } + + ISD::CondCode CC; + unsigned BaseOpc = 0; SDValue Zero = DAG.getConstant(0, DL, XLenVT); switch (Op.getOpcode()) { default: llvm_unreachable("Unhandled reduction"); case ISD::VECREDUCE_AND: + case ISD::VP_REDUCE_AND: { // vpopc ~x == 0 - Vec = DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Vec, Mask, VL); + SDValue TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, ContainerVT, VL); + Vec = DAG.getNode(RISCVISD::VMXOR_VL, DL, ContainerVT, Vec, TrueMask, VL); Vec = DAG.getNode(RISCVISD::VPOPC_VL, DL, XLenVT, Vec, Mask, VL); - return DAG.getSetCC(DL, XLenVT, Vec, Zero, ISD::SETEQ); + CC = ISD::SETEQ; + BaseOpc = ISD::AND; + break; + } case ISD::VECREDUCE_OR: + case ISD::VP_REDUCE_OR: // vpopc x != 0 Vec = DAG.getNode(RISCVISD::VPOPC_VL, DL, XLenVT, Vec, Mask, VL); - return DAG.getSetCC(DL, XLenVT, Vec, Zero, ISD::SETNE); - case ISD::VECREDUCE_XOR: { + CC = ISD::SETNE; + BaseOpc = ISD::OR; + break; + case ISD::VECREDUCE_XOR: + case ISD::VP_REDUCE_XOR: { // ((vpopc x) & 1) != 0 SDValue One = DAG.getConstant(1, DL, XLenVT); Vec = DAG.getNode(RISCVISD::VPOPC_VL, DL, XLenVT, Vec, Mask, VL); Vec = 
DAG.getNode(ISD::AND, DL, XLenVT, Vec, One); - return DAG.getSetCC(DL, XLenVT, Vec, Zero, ISD::SETNE); + CC = ISD::SETNE; + BaseOpc = ISD::XOR; + break; } } + + SDValue SetCC = DAG.getSetCC(DL, XLenVT, Vec, Zero, CC); + + if (!IsVP) + return SetCC; + + // Now include the start value in the operation. + // Note that we must return the start value when no elements are operated + // upon. The vpopc instructions we've emitted in each case above will return + // 0 for an inactive vector, and so we've already received the neutral value: + // AND gives us (0 == 0) -> 1 and OR/XOR give us (0 != 0) -> 0. Therefore we + // can simply include the start value. + return DAG.getNode(BaseOpc, DL, XLenVT, SetCC, Op.getOperand(0)); } SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op, @@ -3912,8 +3989,8 @@ SDValue NeutralElem = DAG.getNeutralElement(BaseOpc, DL, VecEltVT, SDNodeFlags()); SDValue IdentitySplat = DAG.getSplatVector(M1VT, DL, NeutralElem); - SDValue Reduction = - DAG.getNode(RVVOpcode, DL, M1VT, Vec, IdentitySplat, Mask, VL); + SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, DAG.getUNDEF(M1VT), Vec, + IdentitySplat, Mask, VL); SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction, DAG.getConstant(0, DL, Subtarget.getXLenVT())); return DAG.getSExtOrTrunc(Elt0, DL, Op.getValueType()); @@ -3971,12 +4048,83 @@ // FIXME: This is a VLMAX splat which might be too large and can prevent // vsetvli removal. 
SDValue ScalarSplat = DAG.getSplatVector(M1VT, DL, ScalarVal); - SDValue Reduction = - DAG.getNode(RVVOpcode, DL, M1VT, VectorVal, ScalarSplat, Mask, VL); + SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, DAG.getUNDEF(M1VT), + VectorVal, ScalarSplat, Mask, VL); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction, DAG.getConstant(0, DL, Subtarget.getXLenVT())); } +static unsigned getRVVVPReductionOp(unsigned ISDOpcode) { + switch (ISDOpcode) { + default: + llvm_unreachable("Unhandled reduction"); + case ISD::VP_REDUCE_ADD: + return RISCVISD::VECREDUCE_ADD_VL; + case ISD::VP_REDUCE_UMAX: + return RISCVISD::VECREDUCE_UMAX_VL; + case ISD::VP_REDUCE_SMAX: + return RISCVISD::VECREDUCE_SMAX_VL; + case ISD::VP_REDUCE_UMIN: + return RISCVISD::VECREDUCE_UMIN_VL; + case ISD::VP_REDUCE_SMIN: + return RISCVISD::VECREDUCE_SMIN_VL; + case ISD::VP_REDUCE_AND: + return RISCVISD::VECREDUCE_AND_VL; + case ISD::VP_REDUCE_OR: + return RISCVISD::VECREDUCE_OR_VL; + case ISD::VP_REDUCE_XOR: + return RISCVISD::VECREDUCE_XOR_VL; + case ISD::VP_REDUCE_FADD: + return RISCVISD::VECREDUCE_FADD_VL; + case ISD::VP_REDUCE_SEQ_FADD: + return RISCVISD::VECREDUCE_SEQ_FADD_VL; + case ISD::VP_REDUCE_FMAX: + return RISCVISD::VECREDUCE_FMAX_VL; + case ISD::VP_REDUCE_FMIN: + return RISCVISD::VECREDUCE_FMIN_VL; + } +} + +SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Vec = Op.getOperand(1); + EVT VecEVT = Vec.getValueType(); + + // TODO: The type may need to be widened rather than split. Or widened before + // it can be split. 
+ if (!isTypeLegal(VecEVT)) + return SDValue(); + + MVT VecVT = VecEVT.getSimpleVT(); + MVT VecEltVT = VecVT.getVectorElementType(); + unsigned RVVOpcode = getRVVVPReductionOp(Op.getOpcode()); + + MVT ContainerVT = VecVT; + if (VecVT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(VecVT); + Vec = convertToScalableVector(ContainerVT, Vec, DAG, Subtarget); + } + + SDValue VL = Op.getOperand(3); + SDValue Mask = Op.getOperand(2); + + MVT M1VT = getLMUL1VT(ContainerVT); + MVT XLenVT = Subtarget.getXLenVT(); + MVT ResVT = !VecVT.isInteger() || VecEltVT.bitsGE(XLenVT) ? VecEltVT : XLenVT; + + // FIXME: This is a VLMAX splat which might be too large and can prevent + // vsetvli removal. + SDValue StartSplat = DAG.getSplatVector(M1VT, DL, Op.getOperand(0)); + SDValue Reduction = + DAG.getNode(RVVOpcode, DL, M1VT, StartSplat, Vec, StartSplat, Mask, VL); + SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Reduction, + DAG.getConstant(0, DL, Subtarget.getXLenVT())); + if (!VecVT.isInteger()) + return Elt0; + return DAG.getSExtOrTrunc(Elt0, DL, Op.getValueType()); +} + SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { SDValue Vec = Op.getOperand(0); @@ -5434,6 +5582,17 @@ if (SDValue V = lowerVECREDUCE(SDValue(N, 0), DAG)) Results.push_back(V); break; + case ISD::VP_REDUCE_ADD: + case ISD::VP_REDUCE_AND: + case ISD::VP_REDUCE_OR: + case ISD::VP_REDUCE_XOR: + case ISD::VP_REDUCE_SMAX: + case ISD::VP_REDUCE_UMAX: + case ISD::VP_REDUCE_SMIN: + case ISD::VP_REDUCE_UMIN: + if (SDValue V = lowerVPREDUCE(SDValue(N, 0), DAG)) + Results.push_back(V); + break; case ISD::FLT_ROUNDS_: { SDVTList VTs = DAG.getVTList(Subtarget.getXLenVT(), MVT::Other); SDValue Res = DAG.getNode(ISD::FLT_ROUNDS_, DL, VTs, N->getOperand(0)); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ 
b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -230,9 +230,9 @@ def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>; def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>; -def SDTRVVVecReduce : SDTypeProfile<1, 4, [ - SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>, - SDTCisSameNumEltsAs<1, 3>, SDTCisVT<4, XLenVT> +def SDTRVVVecReduce : SDTypeProfile<1, 5, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>, + SDTCVecEltisVT<4, i1>, SDTCisSameNumEltsAs<2, 4>, SDTCisVT<5, XLenVT> ]>; def riscv_mul_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D), @@ -589,14 +589,23 @@ multiclass VPatReductionVL<SDNode vop, string instruction_name, bit is_float> { foreach vti = !if(is_float, AllFloatVectors, AllIntegerVectors) in { defvar vti_m1 = !cast<VTypeInfo>(!if(is_float, "VF", "VI") # vti.SEW # "M1"); - def: Pat<(vti_m1.Vector (vop (vti.Vector vti.RegClass:$rs1), VR:$rs2, + def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), VR:$rs2, (vti.Mask true_mask), VLOpFrag)), (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX) - (vti_m1.Vector (IMPLICIT_DEF)), + (vti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), (vti_m1.Vector VR:$rs2), GPR:$vl, vti.Log2SEW)>; + + def: Pat<(vti_m1.Vector (vop (vti_m1.Vector VR:$merge), (vti.Vector vti.RegClass:$rs1), VR:$rs2, + (vti.Mask VMV0:$vm), + VLOpFrag)), + (!cast<Instruction>(instruction_name#"_VS_"#vti.LMul.MX#"_MASK") + (vti_m1.Vector VR:$merge), + (vti.Vector vti.RegClass:$rs1), + (vti_m1.Vector VR:$rs2), + VMV0:$vm, GPR:$vl, vti.Log2SEW)>; } } @@ -1271,6 +1280,10 @@ VLOpFrag)), (!cast<Instruction>("PseudoVPOPC_M_" # mti.BX) VR:$rs2, GPR:$vl, mti.Log2SEW)>; + def : Pat<(XLenVT (riscv_vpopc_vl (mti.Mask VR:$rs2), (mti.Mask VMV0:$vm), + VLOpFrag)), + (!cast<Instruction>("PseudoVPOPC_M_" # mti.BX # "_MASK") + VR:$rs2, VMV0:$vm, GPR:$vl, mti.Log2SEW)>; } } // Predicates = [HasStdExtV] diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp-vp.ll @@ -0,0 +1,173 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=ilp32d -riscv-v-vector-bits-min=128 \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=lp64d -riscv-v-vector-bits-min=128 \ +; RUN: -verify-machineinstrs < %s | FileCheck %s + +declare half @llvm.vp.reduce.fadd.v2f16(half, <2 x half>, <2 x i1>, i32) + +define half @vpreduce_fadd_v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc half @llvm.vp.reduce.fadd.v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 %evl) + ret half %r +} + +define half @vpreduce_ord_fadd_v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call half @llvm.vp.reduce.fadd.v2f16(half %s, <2 x half> %v, <2 x i1> %m, i32 %evl) + ret half %r +} + +declare half @llvm.vp.reduce.fadd.v4f16(half, <4 x half>, <4 x i1>, i32) + +define half @vpreduce_fadd_v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: 
vsetvli zero, a0, e16, mf2, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc half @llvm.vp.reduce.fadd.v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 %evl) + ret half %r +} + +define half @vpreduce_ord_fadd_v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call half @llvm.vp.reduce.fadd.v4f16(half %s, <4 x half> %v, <4 x i1> %m, i32 %evl) + ret half %r +} + +declare float @llvm.vp.reduce.fadd.v2f32(float, <2 x float>, <2 x i1>, i32) + +define float @vpreduce_fadd_v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc float @llvm.vp.reduce.fadd.v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 %evl) + ret float %r +} + +define float @vpreduce_ord_fadd_v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call float @llvm.vp.reduce.fadd.v2f32(float %s, <2 x float> %v, <2 x i1> %m, i32 %evl) + ret float %r +} + +declare float @llvm.vp.reduce.fadd.v4f32(float, <4 x float>, <4 x i1>, i32) + +define float @vpreduce_fadd_v4f32(float %s, <4 x float> %v, <4 x i1> %m, i32 zeroext 
%evl) { +; CHECK-LABEL: vpreduce_fadd_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc float @llvm.vp.reduce.fadd.v4f32(float %s, <4 x float> %v, <4 x i1> %m, i32 %evl) + ret float %r +} + +define float @vpreduce_ord_fadd_v4f32(float %s, <4 x float> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call float @llvm.vp.reduce.fadd.v4f32(float %s, <4 x float> %v, <4 x i1> %m, i32 %evl) + ret float %r +} + +declare double @llvm.vp.reduce.fadd.v2f64(double, <2 x double>, <2 x i1>, i32) + +define double @vpreduce_fadd_v2f64(double %s, <2 x double> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc double @llvm.vp.reduce.fadd.v2f64(double %s, <2 x double> %v, <2 x i1> %m, i32 %evl) + ret double %r +} + +define double @vpreduce_ord_fadd_v2f64(double %s, <2 x double> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call double @llvm.vp.reduce.fadd.v2f64(double %s, <2 x double> %v, <2 x i1> %m, i32 %evl) + 
ret double %r +} + +declare double @llvm.vp.reduce.fadd.v4f64(double, <4 x double>, <4 x i1>, i32) + +define double @vpreduce_fadd_v4f64(double %s, <4 x double> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc double @llvm.vp.reduce.fadd.v4f64(double %s, <4 x double> %v, <4 x i1> %m, i32 %evl) + ret double %r +} + +define double @vpreduce_ord_fadd_v4f64(double %s, <4 x double> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call double @llvm.vp.reduce.fadd.v4f64(double %s, <4 x double> %v, <4 x i1> %m, i32 %evl) + ret double %r +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll @@ -0,0 +1,1377 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare i8 @llvm.vp.reduce.add.v2i8(i8, <2 x i8>, <2 x i1>, i32) + +define signext i8 @vpreduce_add_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_add_v2i8: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.add.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.umax.v2i8(i8, <2 x i8>, <2 x i1>, i32) + +define signext i8 @vpreduce_umax_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_umax_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.umax.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.smax.v2i8(i8, <2 x i8>, <2 x i1>, i32) + +define signext i8 @vpreduce_smax_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.smax.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.umin.v2i8(i8, <2 x i8>, <2 x i1>, i32) + +define signext i8 @vpreduce_umin_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_umin_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredminu.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.umin.v2i8(i8 %s, <2 x i8> %v, 
<2 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.smin.v2i8(i8, <2 x i8>, <2 x i1>, i32) + +define signext i8 @vpreduce_smin_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.smin.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.and.v2i8(i8, <2 x i8>, <2 x i1>, i32) + +define signext i8 @vpreduce_and_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.and.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.or.v2i8(i8, <2 x i8>, <2 x i1>, i32) + +define signext i8 @vpreduce_or_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.or.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.xor.v2i8(i8, <2 x i8>, <2 x i1>, i32) + +define signext i8 @vpreduce_xor_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, 
a1, e8, mf8, tu, mu +; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.xor.v2i8(i8 %s, <2 x i8> %v, <2 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.add.v4i8(i8, <4 x i8>, <4 x i1>, i32) + +define signext i8 @vpreduce_add_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_add_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.add.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.umax.v4i8(i8, <4 x i8>, <4 x i1>, i32) + +define signext i8 @vpreduce_umax_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_umax_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.umax.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.smax.v4i8(i8, <4 x i8>, <4 x i1>, i32) + +define signext i8 @vpreduce_smax_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.smax.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.umin.v4i8(i8, <4 x i8>, <4 x i1>, i32) + +define signext i8 @vpreduce_umin_v4i8(i8 
signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_umin_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredminu.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.umin.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.smin.v4i8(i8, <4 x i8>, <4 x i1>, i32) + +define signext i8 @vpreduce_smin_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.smin.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.and.v4i8(i8, <4 x i8>, <4 x i1>, i32) + +define signext i8 @vpreduce_and_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.and.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.or.v4i8(i8, <4 x i8>, <4 x i1>, i32) + +define signext i8 @vpreduce_or_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: 
ret + %r = call i8 @llvm.vp.reduce.or.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.xor.v4i8(i8, <4 x i8>, <4 x i1>, i32) + +define signext i8 @vpreduce_xor_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.xor.v4i8(i8 %s, <4 x i8> %v, <4 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i16 @llvm.vp.reduce.add.v2i16(i16, <2 x i16>, <2 x i1>, i32) + +define signext i16 @vpreduce_add_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_add_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.add.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.umax.v2i16(i16, <2 x i16>, <2 x i1>, i32) + +define signext i16 @vpreduce_umax_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e16, 
mf4, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i16 @llvm.vp.reduce.umax.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.smax.v2i16(i16, <2 x i16>, <2 x i1>, i32) + +define signext i16 @vpreduce_smax_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.smax.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.umin.v2i16(i16, <2 x i16>, <2 x i1>, i32) + +define signext i16 @vpreduce_umin_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umin_v2i16: +; RV32: # %bb.0: +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_v2i16: +; RV64: # %bb.0: +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i16 @llvm.vp.reduce.umin.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.smin.v2i16(i16, <2 x i16>, <2 x i1>, i32) + +define signext i16 @vpreduce_smin_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_v2i16: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.smin.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.and.v2i16(i16, <2 x i16>, <2 x i1>, i32) + +define signext i16 @vpreduce_and_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.and.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.or.v2i16(i16, <2 x i16>, <2 x i1>, i32) + +define signext i16 @vpreduce_or_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.or.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.xor.v2i16(i16, <2 x i16>, <2 x i1>, i32) + +define signext i16 @vpreduce_xor_v2i16(i16 signext %s, <2 x i16> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.xor.v2i16(i16 %s, <2 x i16> %v, <2 x i1> %m, i32 
%evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.add.v4i16(i16, <4 x i16>, <4 x i1>, i32) + +define signext i16 @vpreduce_add_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_add_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.add.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.umax.v4i16(i16, <4 x i16>, <4 x i1>, i32) + +define signext i16 @vpreduce_umax_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i16 @llvm.vp.reduce.umax.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.smax.v4i16(i16, <4 x i16>, <4 x i1>, i32) + +define signext i16 @vpreduce_smax_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t +; 
CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.smax.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.umin.v4i16(i16, <4 x i16>, <4 x i1>, i32) + +define signext i16 @vpreduce_umin_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umin_v4i16: +; RV32: # %bb.0: +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_v4i16: +; RV64: # %bb.0: +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i16 @llvm.vp.reduce.umin.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.smin.v4i16(i16, <4 x i16>, <4 x i1>, i32) + +define signext i16 @vpreduce_smin_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.smin.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.and.v4i16(i16, <4 x i16>, <4 x i1>, i32) + +define signext i16 @vpreduce_and_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu 
+; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.and.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.or.v4i16(i16, <4 x i16>, <4 x i1>, i32) + +define signext i16 @vpreduce_or_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.or.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.xor.v4i16(i16, <4 x i16>, <4 x i1>, i32) + +define signext i16 @vpreduce_xor_v4i16(i16 signext %s, <4 x i16> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.xor.v4i16(i16 %s, <4 x i16> %v, <4 x i1> %m, i32 %evl) + ret i16 %r +} + +declare i32 @llvm.vp.reduce.add.v2i32(i32, <2 x i32>, <2 x i1>, i32) + +define signext i32 @vpreduce_add_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_add_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.add.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl) + ret i32 %r +} + +declare i32 
@llvm.vp.reduce.umax.v2i32(i32, <2 x i32>, <2 x i1>, i32) + +define signext i32 @vpreduce_umax_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i32 @llvm.vp.reduce.umax.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.smax.v2i32(i32, <2 x i32>, <2 x i1>, i32) + +define signext i32 @vpreduce_smax_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.smax.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.umin.v2i32(i32, <2 x i32>, <2 x i1>, i32) + +define signext i32 @vpreduce_umin_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umin_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: 
srli a0, a0, 32 +; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i32 @llvm.vp.reduce.umin.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.smin.v2i32(i32, <2 x i32>, <2 x i1>, i32) + +define signext i32 @vpreduce_smin_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.smin.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.and.v2i32(i32, <2 x i32>, <2 x i1>, i32) + +define signext i32 @vpreduce_and_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.and.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.or.v2i32(i32, <2 x i32>, <2 x i1>, i32) + +define signext i32 @vpreduce_or_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.or.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 
%evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.xor.v2i32(i32, <2 x i32>, <2 x i1>, i32) + +define signext i32 @vpreduce_xor_v2i32(i32 signext %s, <2 x i32> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.xor.v2i32(i32 %s, <2 x i32> %v, <2 x i1> %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.add.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define signext i32 @vpreduce_add_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_add_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.add.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.umax.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define signext i32 @vpreduce_umax_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i32 @llvm.vp.reduce.umax.v4i32(i32 %s, <4 x i32> %v, <4 x 
i1> %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.smax.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define signext i32 @vpreduce_smax_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.smax.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.umin.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define signext i32 @vpreduce_umin_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umin_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i32 @llvm.vp.reduce.umin.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.smin.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define signext i32 @vpreduce_smin_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.smin.v4i32(i32 %s, 
<4 x i32> %v, <4 x i1> %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.and.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define signext i32 @vpreduce_and_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.and.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.or.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define signext i32 @vpreduce_or_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.or.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.xor.v4i32(i32, <4 x i32>, <4 x i1>, i32) + +define signext i32 @vpreduce_xor_v4i32(i32 signext %s, <4 x i32> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.xor.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) + ret i32 %r +} + +declare i64 @llvm.vp.reduce.add.v2i64(i64, <2 x i64>, <2 x i1>, i32) + +define signext i64 @vpreduce_add_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_add_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: 
.cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredsum.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_add_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredsum.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.add.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.umax.v2i64(i64, <2 x i64>, <2 x i1>, i32) + +define signext i64 @vpreduce_umax_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.umax.v2i64(i64 
%s, <2 x i64> %v, <2 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.smax.v2i64(i64, <2 x i64>, <2 x i1>, i32) + +define signext i64 @vpreduce_smax_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_smax_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredmax.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_smax_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredmax.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.smax.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.umin.v2i64(i64, <2 x i64>, <2 x i1>, i32) + +define signext i64 @vpreduce_umin_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umin_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; 
RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.umin.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.smin.v2i64(i64, <2 x i64>, <2 x i1>, i32) + +define signext i64 @vpreduce_smin_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_smin_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredmin.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_smin_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredmin.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.smin.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.and.v2i64(i64, <2 x i64>, <2 x i1>, i32) + +define signext i64 @vpreduce_and_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_and_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; 
RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredand.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_and_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredand.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.and.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.or.v2i64(i64, <2 x i64>, <2 x i1>, i32) + +define signext i64 @vpreduce_or_v2i64(i64 signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_or_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredor.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_or_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredor.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.or.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.xor.v2i64(i64, <2 x i64>, <2 x i1>, i32) + +define signext i64 @vpreduce_xor_v2i64(i64 
signext %s, <2 x i64> %v, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_xor_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredxor.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_xor_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredxor.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.xor.v2i64(i64 %s, <2 x i64> %v, <2 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.add.v4i64(i64, <4 x i64>, <4 x i1>, i32) + +define signext i64 @vpreduce_add_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_add_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredsum.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_add_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, 
m2, tu, mu +; RV64-NEXT: vredsum.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.add.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.umax.v4i64(i64, <4 x i64>, <4 x i1>, i32) + +define signext i64 @vpreduce_umax_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.umax.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.smax.v4i64(i64, <4 x i64>, <4 x i1>, i32) + +define signext i64 @vpreduce_smax_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_smax_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredmax.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_smax_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredmax.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.smax.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.umin.v4i64(i64, <4 x i64>, <4 x i1>, i32) + +define signext i64 @vpreduce_umin_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umin_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.umin.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.smin.v4i64(i64, <4 x i64>, <4 x i1>, i32) + +define signext i64 @vpreduce_smin_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_smin_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: 
.cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredmin.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_smin_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredmin.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.smin.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.and.v4i64(i64, <4 x i64>, <4 x i1>, i32) + +define signext i64 @vpreduce_and_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_and_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredand.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_and_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredand.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.and.v4i64(i64 %s, <4 
x i64> %v, <4 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.or.v4i64(i64, <4 x i64>, <4 x i1>, i32) + +define signext i64 @vpreduce_or_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_or_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredor.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_or_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredor.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.or.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.xor.v4i64(i64, <4 x i64>, <4 x i1>, i32) + +define signext i64 @vpreduce_xor_v4i64(i64 signext %s, <4 x i64> %v, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_xor_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredxor.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; 
RV64-LABEL: vpreduce_xor_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredxor.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.xor.v4i64(i64 %s, <4 x i64> %v, <4 x i1> %m, i32 %evl) + ret i64 %r +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll @@ -0,0 +1,265 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s + +declare i1 @llvm.vp.reduce.and.v1i1(i1, <1 x i1>, <1 x i1>, i32) + +define signext i1 @vpreduce_and_v1i1(i1 signext %s, <1 x i1> %v, <1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_v1i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmnand.mm v25, v0, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.and.v1i1(i1 %s, <1 x i1> %v, <1 x i1> %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.or.v1i1(i1, <1 x i1>, <1 x i1>, i32) + +define signext i1 @vpreduce_or_v1i1(i1 signext %s, <1 x i1> %v, <1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_v1i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: or a0, a1, a0 +; CHECK-NEXT: andi 
a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.or.v1i1(i1 %s, <1 x i1> %v, <1 x i1> %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.xor.v1i1(i1, <1 x i1>, <1 x i1>, i32) + +define signext i1 @vpreduce_xor_v1i1(i1 signext %s, <1 x i1> %v, <1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_v1i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: xor a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.xor.v1i1(i1 %s, <1 x i1> %v, <1 x i1> %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.and.v2i1(i1, <2 x i1>, <2 x i1>, i32) + +define signext i1 @vpreduce_and_v2i1(i1 signext %s, <2 x i1> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_v2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmnand.mm v25, v0, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.and.v2i1(i1 %s, <2 x i1> %v, <2 x i1> %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.or.v2i1(i1, <2 x i1>, <2 x i1>, i32) + +define signext i1 @vpreduce_or_v2i1(i1 signext %s, <2 x i1> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_v2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: or a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.or.v2i1(i1 %s, <2 x i1> %v, <2 x i1> %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.xor.v2i1(i1, <2 x i1>, <2 x i1>, i32) + +define signext i1 
@vpreduce_xor_v2i1(i1 signext %s, <2 x i1> %v, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_v2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: xor a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.xor.v2i1(i1 %s, <2 x i1> %v, <2 x i1> %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.and.v4i1(i1, <4 x i1>, <4 x i1>, i32) + +define signext i1 @vpreduce_and_v4i1(i1 signext %s, <4 x i1> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_v4i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vmnand.mm v25, v0, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.and.v4i1(i1 %s, <4 x i1> %v, <4 x i1> %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.or.v4i1(i1, <4 x i1>, <4 x i1>, i32) + +define signext i1 @vpreduce_or_v4i1(i1 signext %s, <4 x i1> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_v4i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: or a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.or.v4i1(i1 %s, <4 x i1> %v, <4 x i1> %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.xor.v4i1(i1, <4 x i1>, <4 x i1>, i32) + +define signext i1 @vpreduce_xor_v4i1(i1 signext %s, <4 x i1> %v, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_v4i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; 
CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: xor a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.xor.v4i1(i1 %s, <4 x i1> %v, <4 x i1> %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.and.v8i1(i1, <8 x i1>, <8 x i1>, i32) + +define signext i1 @vpreduce_and_v8i1(i1 signext %s, <8 x i1> %v, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_v8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmnand.mm v25, v0, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.and.v8i1(i1 %s, <8 x i1> %v, <8 x i1> %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.or.v8i1(i1, <8 x i1>, <8 x i1>, i32) + +define signext i1 @vpreduce_or_v8i1(i1 signext %s, <8 x i1> %v, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_v8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: or a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.or.v8i1(i1 %s, <8 x i1> %v, <8 x i1> %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.xor.v8i1(i1, <8 x i1>, <8 x i1>, i32) + +define signext i1 @vpreduce_xor_v8i1(i1 signext %s, <8 x i1> %v, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_v8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: xor a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.xor.v8i1(i1 %s, <8 x i1> %v, <8 x i1> %m, i32 %evl) + ret i1 %r +} + +declare i1 
@llvm.vp.reduce.and.v16i1(i1, <16 x i1>, <16 x i1>, i32) + +define signext i1 @vpreduce_and_v16i1(i1 signext %s, <16 x i1> %v, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_v16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmnand.mm v25, v0, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.and.v16i1(i1 %s, <16 x i1> %v, <16 x i1> %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.or.v16i1(i1, <16 x i1>, <16 x i1>, i32) + +define signext i1 @vpreduce_or_v16i1(i1 signext %s, <16 x i1> %v, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_v16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: or a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.or.v16i1(i1 %s, <16 x i1> %v, <16 x i1> %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.xor.v16i1(i1, <16 x i1>, <16 x i1>, i32) + +define signext i1 @vpreduce_xor_v16i1(i1 signext %s, <16 x i1> %v, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_v16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: xor a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.xor.v16i1(i1 %s, <16 x i1> %v, <16 x i1> %m, i32 %evl) + ret i1 %r +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp.ll @@ -0,0 +1,257 @@ +; NOTE: Assertions 
have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s + +declare half @llvm.vp.reduce.fadd.nxv1f16(half, <vscale x 1 x half>, <vscale x 1 x i1>, i32) + +define half @vpreduce_fadd_nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_nxv1f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc half @llvm.vp.reduce.fadd.nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 %evl) + ret half %r +} + +define half @vpreduce_ord_fadd_nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_nxv1f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call half @llvm.vp.reduce.fadd.nxv1f16(half %s, <vscale x 1 x half> %v, <vscale x 1 x i1> %m, i32 %evl) + ret half %r +} + +declare half @llvm.vp.reduce.fadd.nxv2f16(half, <vscale x 2 x half>, <vscale x 2 x i1>, i32) + +define half @vpreduce_fadd_nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_nxv2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc half @llvm.vp.reduce.fadd.nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2 x i1> %m, i32 %evl) + ret half %r +} + +define half @vpreduce_ord_fadd_nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_nxv2f16: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call half @llvm.vp.reduce.fadd.nxv2f16(half %s, <vscale x 2 x half> %v, <vscale x 2 x i1> %m, i32 %evl) + ret half %r +} + +declare half @llvm.vp.reduce.fadd.nxv4f16(half, <vscale x 4 x half>, <vscale x 4 x i1>, i32) + +define half @vpreduce_fadd_nxv4f16(half %s, <vscale x 4 x half> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc half @llvm.vp.reduce.fadd.nxv4f16(half %s, <vscale x 4 x half> %v, <vscale x 4 x i1> %m, i32 %evl) + ret half %r +} + +define half @vpreduce_ord_fadd_nxv4f16(half %s, <vscale x 4 x half> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call half @llvm.vp.reduce.fadd.nxv4f16(half %s, <vscale x 4 x half> %v, <vscale x 4 x i1> %m, i32 %evl) + ret half %r +} + +declare float @llvm.vp.reduce.fadd.nxv1f32(float, <vscale x 1 x float>, <vscale x 1 x i1>, i32) + +define float @vpreduce_fadd_nxv1f32(float %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_nxv1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc float @llvm.vp.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 %evl) + ret float %r +} + +define float @vpreduce_ord_fadd_nxv1f32(float %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_nxv1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, 
zero, e32, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call float @llvm.vp.reduce.fadd.nxv1f32(float %s, <vscale x 1 x float> %v, <vscale x 1 x i1> %m, i32 %evl) + ret float %r +} + +declare float @llvm.vp.reduce.fadd.nxv2f32(float, <vscale x 2 x float>, <vscale x 2 x i1>, i32) + +define float @vpreduce_fadd_nxv2f32(float %s, <vscale x 2 x float> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc float @llvm.vp.reduce.fadd.nxv2f32(float %s, <vscale x 2 x float> %v, <vscale x 2 x i1> %m, i32 %evl) + ret float %r +} + +define float @vpreduce_ord_fadd_nxv2f32(float %s, <vscale x 2 x float> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call float @llvm.vp.reduce.fadd.nxv2f32(float %s, <vscale x 2 x float> %v, <vscale x 2 x i1> %m, i32 %evl) + ret float %r +} + +declare float @llvm.vp.reduce.fadd.nxv4f32(float, <vscale x 4 x float>, <vscale x 4 x i1>, i32) + +define float @vpreduce_fadd_nxv4f32(float %s, <vscale x 4 x float> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc float @llvm.vp.reduce.fadd.nxv4f32(float %s, <vscale x 4 x float> %v, <vscale x 4 x i1> %m, i32 %evl) + ret float %r +} + +define float @vpreduce_ord_fadd_nxv4f32(float %s, <vscale x 4 x float> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, 
m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call float @llvm.vp.reduce.fadd.nxv4f32(float %s, <vscale x 4 x float> %v, <vscale x 4 x i1> %m, i32 %evl) + ret float %r +} + +declare double @llvm.vp.reduce.fadd.nxv1f64(double, <vscale x 1 x double>, <vscale x 1 x i1>, i32) + +define double @vpreduce_fadd_nxv1f64(double %s, <vscale x 1 x double> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_nxv1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc double @llvm.vp.reduce.fadd.nxv1f64(double %s, <vscale x 1 x double> %v, <vscale x 1 x i1> %m, i32 %evl) + ret double %r +} + +define double @vpreduce_ord_fadd_nxv1f64(double %s, <vscale x 1 x double> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_nxv1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call double @llvm.vp.reduce.fadd.nxv1f64(double %s, <vscale x 1 x double> %v, <vscale x 1 x i1> %m, i32 %evl) + ret double %r +} + +declare double @llvm.vp.reduce.fadd.nxv2f64(double, <vscale x 2 x double>, <vscale x 2 x i1>, i32) + +define double @vpreduce_fadd_nxv2f64(double %s, <vscale x 2 x double> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc double @llvm.vp.reduce.fadd.nxv2f64(double %s, <vscale x 2 x double> %v, <vscale x 2 x i1> %m, i32 %evl) + ret double %r +} + +define double @vpreduce_ord_fadd_nxv2f64(double %s, <vscale x 2 x double> %v, <vscale x 2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, 
zero, e64, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call double @llvm.vp.reduce.fadd.nxv2f64(double %s, <vscale x 2 x double> %v, <vscale x 2 x i1> %m, i32 %evl) + ret double %r +} + +declare double @llvm.vp.reduce.fadd.nxv4f64(double, <vscale x 4 x double>, <vscale x 4 x i1>, i32) + +define double @vpreduce_fadd_nxv4f64(double %s, <vscale x 4 x double> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_fadd_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu +; CHECK-NEXT: vfredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call reassoc double @llvm.vp.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %v, <vscale x 4 x i1> %m, i32 %evl) + ret double %r +} + +define double @vpreduce_ord_fadd_nxv4f64(double %s, <vscale x 4 x double> %v, <vscale x 4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_ord_fadd_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, mu +; CHECK-NEXT: vfmv.v.f v25, fa0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu +; CHECK-NEXT: vfredosum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vfmv.f.s fa0, v25 +; CHECK-NEXT: ret + %r = call double @llvm.vp.reduce.fadd.nxv4f64(double %s, <vscale x 4 x double> %v, <vscale x 4 x i1> %m, i32 %evl) + ret double %r +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-int-vp.ll @@ -0,0 +1,2063 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +declare i8 @llvm.vp.reduce.add.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32) + +define signext i8 @vpreduce_add_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 
zeroext %evl) { +; CHECK-LABEL: vpreduce_add_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.add.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.umax.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32) + +define signext i8 @vpreduce_umax_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_umax_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.umax.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.smax.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32) + +define signext i8 @vpreduce_smax_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.smax.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.umin.nxv1i8(i8, <vscale x 1 x i8>, <vscale x 1 x i1>, i32) + +define signext i8 @vpreduce_umin_nxv1i8(i8 signext %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_umin_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredminu.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.umin.nxv1i8(i8 %s, <vscale x 1 x i8> %v, <vscale x 1 x i1> %m, i32 %evl) + ret i8 %r +} + +declare i8 
@llvm.vp.reduce.smin.nxv1i8(i8, , , i32) + +define signext i8 @vpreduce_smin_nxv1i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.smin.nxv1i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.and.nxv1i8(i8, , , i32) + +define signext i8 @vpreduce_and_nxv1i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.and.nxv1i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.or.nxv1i8(i8, , , i32) + +define signext i8 @vpreduce_or_nxv1i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.or.nxv1i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.xor.nxv1i8(i8, , , i32) + +define signext i8 @vpreduce_xor_nxv1i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.xor.nxv1i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 
@llvm.vp.reduce.add.nxv2i8(i8, , , i32) + +define signext i8 @vpreduce_add_nxv2i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_add_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.add.nxv2i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.umax.nxv2i8(i8, , , i32) + +define signext i8 @vpreduce_umax_nxv2i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_umax_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.umax.nxv2i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.smax.nxv2i8(i8, , , i32) + +define signext i8 @vpreduce_smax_nxv2i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.smax.nxv2i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.umin.nxv2i8(i8, , , i32) + +define signext i8 @vpreduce_umin_nxv2i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_umin_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredminu.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 
@llvm.vp.reduce.umin.nxv2i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.smin.nxv2i8(i8, , , i32) + +define signext i8 @vpreduce_smin_nxv2i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.smin.nxv2i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.and.nxv2i8(i8, , , i32) + +define signext i8 @vpreduce_and_nxv2i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.and.nxv2i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.or.nxv2i8(i8, , , i32) + +define signext i8 @vpreduce_or_nxv2i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.or.nxv2i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.xor.nxv2i8(i8, , , i32) + +define signext i8 @vpreduce_xor_nxv2i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 
@llvm.vp.reduce.xor.nxv2i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.add.nxv4i8(i8, , , i32) + +define signext i8 @vpreduce_add_nxv4i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_add_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.add.nxv4i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.umax.nxv4i8(i8, , , i32) + +define signext i8 @vpreduce_umax_nxv4i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_umax_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; CHECK-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.umax.nxv4i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.smax.nxv4i8(i8, , , i32) + +define signext i8 @vpreduce_smax_nxv4i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.smax.nxv4i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.umin.nxv4i8(i8, , , i32) + +define signext i8 @vpreduce_umin_nxv4i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_umin_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; CHECK-NEXT: vredminu.vs v25, 
v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.umin.nxv4i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.smin.nxv4i8(i8, , , i32) + +define signext i8 @vpreduce_smin_nxv4i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.smin.nxv4i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.and.nxv4i8(i8, , , i32) + +define signext i8 @vpreduce_and_nxv4i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.and.nxv4i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.or.nxv4i8(i8, , , i32) + +define signext i8 @vpreduce_or_nxv4i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.or.nxv4i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i8 @llvm.vp.reduce.xor.nxv4i8(i8, , , i32) + +define signext i8 @vpreduce_xor_nxv4i8(i8 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; CHECK-NEXT: vredxor.vs v25, 
v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i8 @llvm.vp.reduce.xor.nxv4i8(i8 %s, %v, %m, i32 %evl) + ret i8 %r +} + +declare i16 @llvm.vp.reduce.add.nxv1i16(i16, , , i32) + +define signext i16 @vpreduce_add_nxv1i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_add_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.add.nxv1i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.umax.nxv1i16(i16, , , i32) + +define signext i16 @vpreduce_umax_nxv1i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_nxv1i16: +; RV32: # %bb.0: +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_nxv1i16: +; RV64: # %bb.0: +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i16 @llvm.vp.reduce.umax.nxv1i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.smax.nxv1i16(i16, , , i32) + +define signext i16 @vpreduce_smax_nxv1i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t +; 
CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.smax.nxv1i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.umin.nxv1i16(i16, , , i32) + +define signext i16 @vpreduce_umin_nxv1i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umin_nxv1i16: +; RV32: # %bb.0: +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_nxv1i16: +; RV64: # %bb.0: +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i16 @llvm.vp.reduce.umin.nxv1i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.smin.nxv1i16(i16, , , i32) + +define signext i16 @vpreduce_smin_nxv1i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.smin.nxv1i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.and.nxv1i16(i16, , , i32) + +define signext i16 @vpreduce_and_nxv1i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; 
CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.and.nxv1i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.or.nxv1i16(i16, , , i32) + +define signext i16 @vpreduce_or_nxv1i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.or.nxv1i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.xor.nxv1i16(i16, , , i32) + +define signext i16 @vpreduce_xor_nxv1i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu +; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.xor.nxv1i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.add.nxv2i16(i16, , , i32) + +define signext i16 @vpreduce_add_nxv2i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_add_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.add.nxv2i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.umax.nxv2i16(i16, , , i32) + +define signext i16 @vpreduce_umax_nxv2i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_nxv2i16: +; RV32: # %bb.0: +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, 
mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_nxv2i16: +; RV64: # %bb.0: +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i16 @llvm.vp.reduce.umax.nxv2i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.smax.nxv2i16(i16, , , i32) + +define signext i16 @vpreduce_smax_nxv2i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.smax.nxv2i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.umin.nxv2i16(i16, , , i32) + +define signext i16 @vpreduce_umin_nxv2i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umin_nxv2i16: +; RV32: # %bb.0: +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_nxv2i16: +; RV64: # %bb.0: +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; 
RV64-NEXT: ret + %r = call i16 @llvm.vp.reduce.umin.nxv2i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.smin.nxv2i16(i16, , , i32) + +define signext i16 @vpreduce_smin_nxv2i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.smin.nxv2i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.and.nxv2i16(i16, , , i32) + +define signext i16 @vpreduce_and_nxv2i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.and.nxv2i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.or.nxv2i16(i16, , , i32) + +define signext i16 @vpreduce_or_nxv2i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.or.nxv2i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.xor.nxv2i16(i16, , , i32) + +define signext i16 @vpreduce_xor_nxv2i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu +; CHECK-NEXT: vredxor.vs 
v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.xor.nxv2i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.add.nxv4i16(i16, , , i32) + +define signext i16 @vpreduce_add_nxv4i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_add_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.add.nxv4i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.umax.nxv4i16(i16, , , i32) + +define signext i16 @vpreduce_umax_nxv4i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_nxv4i16: +; RV32: # %bb.0: +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e16, m1, tu, mu +; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_nxv4i16: +; RV64: # %bb.0: +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e16, m1, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i16 @llvm.vp.reduce.umax.nxv4i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.smax.nxv4i16(i16, , , i32) + +define signext i16 @vpreduce_smax_nxv4i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, 
v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.smax.nxv4i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.umin.nxv4i16(i16, , , i32) + +define signext i16 @vpreduce_umin_nxv4i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umin_nxv4i16: +; RV32: # %bb.0: +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e16, m1, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_nxv4i16: +; RV64: # %bb.0: +; RV64-NEXT: lui a2, 16 +; RV64-NEXT: addiw a2, a2, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e16, m1, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i16 @llvm.vp.reduce.umin.nxv4i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.smin.nxv4i16(i16, , , i32) + +define signext i16 @vpreduce_smin_nxv4i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.smin.nxv4i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.and.nxv4i16(i16, , , i32) + +define signext i16 @vpreduce_and_nxv4i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; 
CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.and.nxv4i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.or.nxv4i16(i16, , , i32) + +define signext i16 @vpreduce_or_nxv4i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.or.nxv4i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i16 @llvm.vp.reduce.xor.nxv4i16(i16, , , i32) + +define signext i16 @vpreduce_xor_nxv4i16(i16 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu +; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i16 @llvm.vp.reduce.xor.nxv4i16(i16 %s, %v, %m, i32 %evl) + ret i16 %r +} + +declare i32 @llvm.vp.reduce.add.nxv1i32(i32, , , i32) + +define signext i32 @vpreduce_add_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_add_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.add.nxv1i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.umax.nxv1i32(i32, , , i32) + +define signext i32 @vpreduce_umax_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_nxv1i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; 
RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_nxv1i32: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i32 @llvm.vp.reduce.umax.nxv1i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.smax.nxv1i32(i32, , , i32) + +define signext i32 @vpreduce_smax_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.smax.nxv1i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.umin.nxv1i32(i32, , , i32) + +define signext i32 @vpreduce_umin_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umin_nxv1i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_nxv1i32: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i32 @llvm.vp.reduce.umin.nxv1i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.smin.nxv1i32(i32, , , i32) + +define signext i32 @vpreduce_smin_nxv1i32(i32 signext 
%s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.smin.nxv1i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.and.nxv1i32(i32, , , i32) + +define signext i32 @vpreduce_and_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.and.nxv1i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.or.nxv1i32(i32, , , i32) + +define signext i32 @vpreduce_or_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.or.nxv1i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.xor.nxv1i32(i32, , , i32) + +define signext i32 @vpreduce_xor_nxv1i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.xor.nxv1i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.add.nxv2i32(i32, , , i32) + 
+define signext i32 @vpreduce_add_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_add_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.add.nxv2i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.umax.nxv2i32(i32, , , i32) + +define signext i32 @vpreduce_umax_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i32 @llvm.vp.reduce.umax.nxv2i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.smax.nxv2i32(i32, , , i32) + +define signext i32 @vpreduce_smax_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.smax.nxv2i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.umin.nxv2i32(i32, , , i32) + +define signext i32 @vpreduce_umin_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: 
vpreduce_umin_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i32 @llvm.vp.reduce.umin.nxv2i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.smin.nxv2i32(i32, , , i32) + +define signext i32 @vpreduce_smin_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.smin.nxv2i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.and.nxv2i32(i32, , , i32) + +define signext i32 @vpreduce_and_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.and.nxv2i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.or.nxv2i32(i32, , , i32) + +define signext i32 @vpreduce_or_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; 
CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.or.nxv2i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.xor.nxv2i32(i32, , , i32) + +define signext i32 @vpreduce_xor_nxv2i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.xor.nxv2i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.add.nxv4i32(i32, , , i32) + +define signext i32 @vpreduce_add_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_add_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; CHECK-NEXT: vredsum.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.add.nxv4i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.umax.nxv4i32(i32, , , i32) + +define signext i32 @vpreduce_umax_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; 
RV64-NEXT: ret + %r = call i32 @llvm.vp.reduce.umax.nxv4i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.smax.nxv4i32(i32, , , i32) + +define signext i32 @vpreduce_smax_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smax_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; CHECK-NEXT: vredmax.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.smax.nxv4i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.umin.nxv4i32(i32, , , i32) + +define signext i32 @vpreduce_umin_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umin_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV32-NEXT: vmv.v.x v25, a0 +; RV32-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i32 @llvm.vp.reduce.umin.nxv4i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.smin.nxv4i32(i32, , , i32) + +define signext i32 @vpreduce_smin_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_smin_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; CHECK-NEXT: vredmin.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.smin.nxv4i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 
@llvm.vp.reduce.and.nxv4i32(i32, , , i32) + +define signext i32 @vpreduce_and_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; CHECK-NEXT: vredand.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.and.nxv4i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.or.nxv4i32(i32, , , i32) + +define signext i32 @vpreduce_or_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; CHECK-NEXT: vredor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.or.nxv4i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i32 @llvm.vp.reduce.xor.nxv4i32(i32, , , i32) + +define signext i32 @vpreduce_xor_nxv4i32(i32 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.x v25, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; CHECK-NEXT: vredxor.vs v25, v8, v25, v0.t +; CHECK-NEXT: vmv.x.s a0, v25 +; CHECK-NEXT: ret + %r = call i32 @llvm.vp.reduce.xor.nxv4i32(i32 %s, %v, %m, i32 %evl) + ret i32 %r +} + +declare i64 @llvm.vp.reduce.add.nxv1i64(i64, , , i32) + +define signext i64 @vpreduce_add_nxv1i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_add_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, 
mu +; RV32-NEXT: vredsum.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_add_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredsum.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.add.nxv1i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.umax.nxv1i64(i64, , , i32) + +define signext i64 @vpreduce_umax_nxv1i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.umax.nxv1i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.smax.nxv1i64(i64, , , i32) + +define signext i64 @vpreduce_smax_nxv1i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_smax_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; 
RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredmax.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_smax_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredmax.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.smax.nxv1i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.umin.nxv1i64(i64, , , i32) + +define signext i64 @vpreduce_umin_nxv1i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umin_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.umin.nxv1i64(i64 %s, %v, %m, i32 %evl) + ret 
i64 %r +} + +declare i64 @llvm.vp.reduce.smin.nxv1i64(i64, , , i32) + +define signext i64 @vpreduce_smin_nxv1i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_smin_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredmin.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_smin_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredmin.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.smin.nxv1i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.and.nxv1i64(i64, , , i32) + +define signext i64 @vpreduce_and_nxv1i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_and_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredand.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_and_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; 
RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredand.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.and.nxv1i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.or.nxv1i64(i64, , , i32) + +define signext i64 @vpreduce_or_nxv1i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_or_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredor.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_or_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredor.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.or.nxv1i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.xor.nxv1i64(i64, , , i32) + +define signext i64 @vpreduce_xor_nxv1i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_xor_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu +; RV32-NEXT: vredxor.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, 
mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_xor_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vredxor.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.xor.nxv1i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.add.nxv2i64(i64, , , i32) + +define signext i64 @vpreduce_add_nxv2i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_add_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredsum.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_add_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredsum.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.add.nxv2i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.umax.nxv2i64(i64, , , i32) + +define signext i64 @vpreduce_umax_nxv2i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; 
RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.umax.nxv2i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.smax.nxv2i64(i64, , , i32) + +define signext i64 @vpreduce_smax_nxv2i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_smax_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredmax.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_smax_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredmax.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.smax.nxv2i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.umin.nxv2i64(i64, , , i32) + +define signext i64 @vpreduce_umin_nxv2i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: 
vpreduce_umin_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.umin.nxv2i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.smin.nxv2i64(i64, , , i32) + +define signext i64 @vpreduce_smin_nxv2i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_smin_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredmin.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_smin_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredmin.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r 
= call i64 @llvm.vp.reduce.smin.nxv2i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.and.nxv2i64(i64, , , i32) + +define signext i64 @vpreduce_and_nxv2i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_and_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredand.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_and_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredand.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.and.nxv2i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.or.nxv2i64(i64, , , i32) + +define signext i64 @vpreduce_or_nxv2i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_or_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredor.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_or_nxv2i64: +; RV64: # 
%bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredor.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.or.nxv2i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.xor.nxv2i64(i64, , , i32) + +define signext i64 @vpreduce_xor_nxv2i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_xor_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu +; RV32-NEXT: vredxor.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_xor_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vredxor.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.xor.nxv2i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.add.nxv4i64(i64, , , i32) + +define signext i64 @vpreduce_add_nxv4i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_add_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu +; RV32-NEXT: vredsum.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: 
addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_add_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu +; RV64-NEXT: vredsum.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.add.nxv4i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.umax.nxv4i64(i64, , , i32) + +define signext i64 @vpreduce_umax_nxv4i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umax_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu +; RV32-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umax_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu +; RV64-NEXT: vredmaxu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.umax.nxv4i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.smax.nxv4i64(i64, , , i32) + +define signext i64 @vpreduce_smax_nxv4i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_smax_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; 
RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu +; RV32-NEXT: vredmax.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_smax_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu +; RV64-NEXT: vredmax.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.smax.nxv4i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.umin.nxv4i64(i64, , , i32) + +define signext i64 @vpreduce_umin_nxv4i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_umin_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu +; RV32-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_umin_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu +; RV64-NEXT: vredminu.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.umin.nxv4i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.smin.nxv4i64(i64, , , i32) + +define signext i64 
@vpreduce_smin_nxv4i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_smin_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu +; RV32-NEXT: vredmin.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_smin_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu +; RV64-NEXT: vredmin.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.smin.nxv4i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.and.nxv4i64(i64, , , i32) + +define signext i64 @vpreduce_and_nxv4i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_and_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu +; RV32-NEXT: vredand.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_and_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu +; RV64-NEXT: 
vredand.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.and.nxv4i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.or.nxv4i64(i64, , , i32) + +define signext i64 @vpreduce_or_nxv4i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_or_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu +; RV32-NEXT: vredor.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_or_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu +; RV64-NEXT: vredor.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.or.nxv4i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} + +declare i64 @llvm.vp.reduce.xor.nxv4i64(i64, , , i32) + +define signext i64 @vpreduce_xor_nxv4i64(i64 signext %s, %v, %m, i32 zeroext %evl) { +; RV32-LABEL: vpreduce_xor_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v25, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu +; RV32-NEXT: vredxor.vs v25, v8, v25, v0.t +; RV32-NEXT: vmv.x.s a0, v25 +; RV32-NEXT: addi a1, zero, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vsrl.vx v25, v25, a1 +; RV32-NEXT: vmv.x.s a1, v25 +; RV32-NEXT: addi 
sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vpreduce_xor_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a2, zero, e64, m1, ta, mu +; RV64-NEXT: vmv.v.x v25, a0 +; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu +; RV64-NEXT: vredxor.vs v25, v8, v25, v0.t +; RV64-NEXT: vmv.x.s a0, v25 +; RV64-NEXT: ret + %r = call i64 @llvm.vp.reduce.xor.nxv4i64(i64 %s, %v, %m, i32 %evl) + ret i64 %r +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll @@ -0,0 +1,367 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s + +declare i1 @llvm.vp.reduce.and.nxv1i1(i1, , , i32) + +define signext i1 @vpreduce_and_nxv1i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv1i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmnand.mm v25, v0, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.and.nxv1i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.or.nxv1i1(i1, , , i32) + +define signext i1 @vpreduce_or_nxv1i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv1i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: or a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.or.nxv1i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 
@llvm.vp.reduce.xor.nxv1i1(i1, , , i32) + +define signext i1 @vpreduce_xor_nxv1i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv1i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: xor a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.xor.nxv1i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.and.nxv2i1(i1, , , i32) + +define signext i1 @vpreduce_and_nxv2i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vmnand.mm v25, v0, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.and.nxv2i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.or.nxv2i1(i1, , , i32) + +define signext i1 @vpreduce_or_nxv2i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: or a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.or.nxv2i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.xor.nxv2i1(i1, , , i32) + +define signext i1 @vpreduce_xor_nxv2i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: xor a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 
+; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.xor.nxv2i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.and.nxv4i1(i1, , , i32) + +define signext i1 @vpreduce_and_nxv4i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv4i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmnand.mm v25, v0, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.and.nxv4i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.or.nxv4i1(i1, , , i32) + +define signext i1 @vpreduce_or_nxv4i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv4i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: or a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.or.nxv4i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.xor.nxv4i1(i1, , , i32) + +define signext i1 @vpreduce_xor_nxv4i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv4i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: xor a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.xor.nxv4i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.and.nxv8i1(i1, , , i32) + +define signext i1 @vpreduce_and_nxv8i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: 
vmnand.mm v25, v0, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.and.nxv8i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.or.nxv8i1(i1, , , i32) + +define signext i1 @vpreduce_or_nxv8i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: or a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.or.nxv8i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.xor.nxv8i1(i1, , , i32) + +define signext i1 @vpreduce_xor_nxv8i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: xor a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.xor.nxv8i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.and.nxv16i1(i1, , , i32) + +define signext i1 @vpreduce_and_nxv16i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vmnand.mm v25, v0, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.and.nxv16i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.or.nxv16i1(i1, , , i32) + +define signext i1 @vpreduce_or_nxv16i1(i1 signext %s, 
%v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: or a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.or.nxv16i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.xor.nxv16i1(i1, , , i32) + +define signext i1 @vpreduce_xor_nxv16i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: xor a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.xor.nxv16i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.and.nxv32i1(i1, , , i32) + +define signext i1 @vpreduce_and_nxv32i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vmnand.mm v25, v0, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.and.nxv32i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.or.nxv32i1(i1, , , i32) + +define signext i1 @vpreduce_or_nxv32i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: or a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 
@llvm.vp.reduce.or.nxv32i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.xor.nxv32i1(i1, , , i32) + +define signext i1 @vpreduce_xor_nxv32i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: xor a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.xor.nxv32i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.and.nxv64i1(i1, , , i32) + +define signext i1 @vpreduce_and_nxv64i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_and_nxv64i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vmnand.mm v25, v0, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: seqz a1, a1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.and.nxv64i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.or.nxv64i1(i1, , , i32) + +define signext i1 @vpreduce_or_nxv64i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_or_nxv64i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: or a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.or.nxv64i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +} + +declare i1 @llvm.vp.reduce.xor.nxv64i1(i1, , , i32) + +define signext i1 @vpreduce_xor_nxv64i1(i1 signext %s, %v, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpreduce_xor_nxv64i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v25, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; 
CHECK-NEXT: vpopc.m a1, v25, v0.t +; CHECK-NEXT: xor a0, a1, a0 +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %r = call i1 @llvm.vp.reduce.xor.nxv64i1(i1 %s, %v, %m, i32 %evl) + ret i1 %r +}