Index: llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h +++ llvm/trunk/include/llvm/CodeGen/ISDOpcodes.h @@ -872,11 +872,14 @@ VECREDUCE_STRICT_FADD, VECREDUCE_STRICT_FMUL, /// These reductions are non-strict, and have a single vector operand. VECREDUCE_FADD, VECREDUCE_FMUL, + /// FMIN/FMAX nodes can have flags, for NaN/NoNaN variants. + VECREDUCE_FMAX, VECREDUCE_FMIN, + /// Integer reductions may have a result type larger than the vector element + /// type. However, the reduction is performed using the vector element type + /// and the value in the top bits is unspecified. VECREDUCE_ADD, VECREDUCE_MUL, VECREDUCE_AND, VECREDUCE_OR, VECREDUCE_XOR, VECREDUCE_SMAX, VECREDUCE_SMIN, VECREDUCE_UMAX, VECREDUCE_UMIN, - /// FMIN/FMAX nodes can have flags, for NaN/NoNaN variants. - VECREDUCE_FMAX, VECREDUCE_FMIN, /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific pre-isel opcode values start here. Index: llvm/trunk/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/TargetLowering.h +++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h @@ -3893,6 +3893,10 @@ bool expandMULO(SDNode *Node, SDValue &Result, SDValue &Overflow, SelectionDAG &DAG) const; + /// Expand a VECREDUCE_* into an explicit calculation. If Count is specified, + /// only the first Count elements of the vector are used. + SDValue expandVecReduce(SDNode *Node, SelectionDAG &DAG) const; + //===--------------------------------------------------------------------===// // Instruction Emitting Hooks // Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -398,6 +398,7 @@ SDValue visitMSCATTER(SDNode *N); SDValue visitFP_TO_FP16(SDNode *N); SDValue visitFP16_TO_FP(SDNode *N); + SDValue visitVECREDUCE(SDNode *N); SDValue visitFADDForFMACombine(SDNode *N); SDValue visitFSUBForFMACombine(SDNode *N); @@ -1592,6 +1593,19 @@ case ISD::MSTORE: return visitMSTORE(N); case ISD::FP_TO_FP16: return visitFP_TO_FP16(N); case ISD::FP16_TO_FP: return visitFP16_TO_FP(N); + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: return visitVECREDUCE(N); } return SDValue(); } @@ -18307,6 +18321,24 @@ return SDValue(); } +SDValue DAGCombiner::visitVECREDUCE(SDNode *N) { + SDValue N0 = N->getOperand(0); + EVT VT = N0.getValueType(); + + // VECREDUCE over 1-element vector is just an extract. + if (VT.getVectorNumElements() == 1) { + SDLoc dl(N); + SDValue Res = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0, + DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); + if (Res.getValueType() != N->getValueType(0)) + Res = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Res); + return Res; + } + + return SDValue(); +} + /// Returns a vector_shuffle if it able to transform an AND to a vector_shuffle /// with the destination vector and a zero vector. /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. 
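A minimal IR sketch of what the new visitVECREDUCE combine handles (the intrinsic declaration is reused from the AArch64 tests further down; the function name is illustrative and not part of the patch): a reduction over a one-element vector is just element 0 of the operand, any-extended only if the result type is wider than the element type.

declare i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> %a)

; With the combine above this becomes a plain EXTRACT_VECTOR_ELT of lane 0
; instead of going through expandVecReduce.
define i64 @reduce_v1i64(<1 x i64> %a) {
  %b = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> %a)
  ret i64 %b
}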
==> Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1140,6 +1140,22 @@ Action = TLI.getOperationAction(Node->getOpcode(), cast(Node)->getValue().getValueType()); break; + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + Action = TLI.getOperationAction( + Node->getOpcode(), Node->getOperand(0).getValueType()); + break; default: if (Node->getOpcode() >= ISD::BUILTIN_OP_END) { Action = TargetLowering::Legal; @@ -3602,6 +3618,21 @@ ReplaceNode(SDValue(Node, 0), Result); break; } + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + Results.push_back(TLI.expandVecReduce(Node, DAG)); + break; case ISD::GLOBAL_OFFSET_TABLE: case ISD::GlobalAddress: case ISD::GlobalTLSAddress: Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -172,6 +172,18 @@ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: Res = PromoteIntRes_AtomicCmpSwap(cast(N), ResNo); break; + + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + Res = PromoteIntRes_VECREDUCE(N); + break; } // If the result is null then the sub-method took care of registering it. @@ -1107,6 +1119,16 @@ case ISD::UMULFIX: Res = PromoteIntOp_MULFIX(N); break; case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break; + + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: Res = PromoteIntOp_VECREDUCE(N); break; } // If the result is null, the sub-method took care of registering results etc. 
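A hedged IR example for the operand-promotion path added above (the declaration is taken from the umax test below; the function name is illustrative): when the vector element type is illegal, the operand is promoted first, and the kind of extension depends on the opcode — any-extend for add/mul/and/or/xor, sign-extend for smax/smin, and zero-extend for umax/umin so that the new high bits cannot affect the comparison.

declare i8 @llvm.experimental.vector.reduce.umax.i8.v3i8(<3 x i8> %a)

; The <3 x i8> operand is zero-extended to a legal wider element type, the
; UMAX reduction is performed there, and the result is truncated back to i8
; when the promoted element type is wider than the result type.
define i8 @reduce_umax_v3i8(<3 x i8> %a) {
  %b = call i8 @llvm.experimental.vector.reduce.umax.i8.v3i8(<3 x i8> %a)
  ret i8 %b
}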
@@ -1483,6 +1505,39 @@ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_VECREDUCE(SDNode *N) { + SDLoc dl(N); + SDValue Op; + switch (N->getOpcode()) { + default: llvm_unreachable("Expected integer vector reduction"); + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + Op = GetPromotedInteger(N->getOperand(0)); + break; + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + Op = SExtPromotedInteger(N->getOperand(0)); + break; + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + Op = ZExtPromotedInteger(N->getOperand(0)); + break; + } + + EVT EltVT = Op.getValueType().getVectorElementType(); + EVT VT = N->getValueType(0); + if (VT.bitsGE(EltVT)) + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, Op); + + // Result size must be >= element size. If this is not the case after + // promotion, also promote the result type and then truncate. + SDValue Reduce = DAG.getNode(N->getOpcode(), dl, EltVT, Op); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Reduce); +} + //===----------------------------------------------------------------------===// // Integer Result Expansion //===----------------------------------------------------------------------===// @@ -1624,6 +1679,16 @@ case ISD::USUBSAT: ExpandIntRes_ADDSUBSAT(N, Lo, Hi); break; case ISD::SMULFIX: case ISD::UMULFIX: ExpandIntRes_MULFIX(N, Lo, Hi); break; + + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: ExpandIntRes_VECREDUCE(N, Lo, Hi); break; } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -3172,6 +3237,14 @@ ReplaceValueWith(SDValue(N, 1), Swap.getValue(2)); } +void DAGTypeLegalizer::ExpandIntRes_VECREDUCE(SDNode *N, + SDValue &Lo, SDValue &Hi) { + // TODO For VECREDUCE_(AND|OR|XOR) we could split the vector and calculate + // both halves independently. + SDValue Res = TLI.expandVecReduce(N, DAG); + SplitInteger(Res, Lo, Hi); +} + //===----------------------------------------------------------------------===// // Integer Operand Expansion //===----------------------------------------------------------------------===// @@ -3840,6 +3913,14 @@ V0, ConvElem, N->getOperand(2)); } +SDValue DAGTypeLegalizer::PromoteIntRes_VECREDUCE(SDNode *N) { + // The VECREDUCE result size may be larger than the element size, so + // we can simply change the result type. + SDLoc dl(N); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); +} + SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) { SDLoc dl(N); SDValue V0 = GetPromotedInteger(N->getOperand(0)); Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -346,6 +346,7 @@ SDValue PromoteIntRes_ADDSUBSAT(SDNode *N); SDValue PromoteIntRes_MULFIX(SDNode *N); SDValue PromoteIntRes_FLT_ROUNDS(SDNode *N); + SDValue PromoteIntRes_VECREDUCE(SDNode *N); // Integer Operand Promotion. 
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); @@ -380,6 +381,7 @@ SDValue PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_MULFIX(SDNode *N); SDValue PromoteIntOp_FPOWI(SDNode *N); + SDValue PromoteIntOp_VECREDUCE(SDNode *N); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -438,6 +440,7 @@ void ExpandIntRes_MULFIX (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandIntRes_ATOMIC_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandIntRes_VECREDUCE (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandShiftByConstant(SDNode *N, const APInt &Amt, SDValue &Lo, SDValue &Hi); @@ -705,6 +708,7 @@ SDValue ScalarizeVecOp_VSETCC(SDNode *N); SDValue ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo); SDValue ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo); + SDValue ScalarizeVecOp_VECREDUCE(SDNode *N); //===--------------------------------------------------------------------===// // Vector Splitting Support: LegalizeVectorTypes.cpp @@ -835,6 +839,7 @@ SDValue WidenVecOp_Convert(SDNode *N); SDValue WidenVecOp_FCOPYSIGN(SDNode *N); + SDValue WidenVecOp_VECREDUCE(SDNode *N); //===--------------------------------------------------------------------===// // Vector Widening Utilities Support: LegalizeVectorTypes.cpp Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -294,12 +294,13 @@ } } - bool HasVectorValue = false; - for (SDNode::value_iterator J = Node->value_begin(), E = Node->value_end(); - J != E; - ++J) - HasVectorValue |= J->isVector(); - if (!HasVectorValue) + bool HasVectorValueOrOp = false; + for (auto J = Node->value_begin(), E = Node->value_end(); J != E; ++J) + HasVectorValueOrOp |= J->isVector(); + for (const SDValue &Op : Node->op_values()) + HasVectorValueOrOp |= Op.getValueType().isVector(); + + if (!HasVectorValueOrOp) return TranslateLegalizeResults(Op, Result); TargetLowering::LegalizeAction Action = TargetLowering::Legal; @@ -441,6 +442,19 @@ break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: Action = TLI.getOperationAction(Node->getOpcode(), Node->getOperand(0).getValueType()); break; @@ -816,6 +830,20 @@ case ISD::STRICT_FROUND: case ISD::STRICT_FTRUNC: return ExpandStrictFPOp(Op); + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + return TLI.expandVecReduce(Op.getNode(), DAG); default: return DAG.UnrollVectorOp(Op.getNode()); } Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -606,6 +606,21 @@ case ISD::FP_ROUND: Res = ScalarizeVecOp_FP_ROUND(N, 
OpNo); break; + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + Res = ScalarizeVecOp_VECREDUCE(N); + break; } } @@ -736,6 +751,14 @@ return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res); } +SDValue DAGTypeLegalizer::ScalarizeVecOp_VECREDUCE(SDNode *N) { + SDValue Res = GetScalarizedVector(N->getOperand(0)); + // Result type may be wider than element type. + if (Res.getValueType() != N->getValueType(0)) + Res = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Res); + return Res; +} + //===----------------------------------------------------------------------===// // Result Vector Splitting //===----------------------------------------------------------------------===// @@ -3868,6 +3891,22 @@ case ISD::TRUNCATE: Res = WidenVecOp_Convert(N); break; + + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + Res = WidenVecOp_VECREDUCE(N); + break; } // If Res is null, the sub-method took care of registering the result. @@ -4216,6 +4255,62 @@ return PromoteTargetBoolean(CC, VT); } +SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { + SDLoc dl(N); + SDValue Op = GetWidenedVector(N->getOperand(0)); + EVT OrigVT = N->getOperand(0).getValueType(); + EVT WideVT = Op.getValueType(); + EVT ElemVT = OrigVT.getVectorElementType(); + + SDValue NeutralElem; + switch (N->getOpcode()) { + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_UMAX: + NeutralElem = DAG.getConstant(0, dl, ElemVT); + break; + case ISD::VECREDUCE_MUL: + NeutralElem = DAG.getConstant(1, dl, ElemVT); + break; + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_UMIN: + NeutralElem = DAG.getAllOnesConstant(dl, ElemVT); + break; + case ISD::VECREDUCE_SMAX: + NeutralElem = DAG.getConstant( + APInt::getSignedMinValue(ElemVT.getSizeInBits()), dl, ElemVT); + break; + case ISD::VECREDUCE_SMIN: + NeutralElem = DAG.getConstant( + APInt::getSignedMaxValue(ElemVT.getSizeInBits()), dl, ElemVT); + break; + case ISD::VECREDUCE_FADD: + NeutralElem = DAG.getConstantFP(0.0, dl, ElemVT); + break; + case ISD::VECREDUCE_FMUL: + NeutralElem = DAG.getConstantFP(1.0, dl, ElemVT); + break; + case ISD::VECREDUCE_FMAX: + NeutralElem = DAG.getConstantFP( + std::numeric_limits::infinity(), dl, ElemVT); + break; + case ISD::VECREDUCE_FMIN: + NeutralElem = DAG.getConstantFP( + -std::numeric_limits::infinity(), dl, ElemVT); + break; + } + + // Pad the vector with the neutral element. 
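+  // E.g. widening a <3 x i32> VECREDUCE_ADD operand to <4 x i32> inserts a
+  // zero into the extra lane, so the padding cannot change the reduced value.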
+ unsigned OrigElts = OrigVT.getVectorNumElements(); + unsigned WideElts = WideVT.getVectorNumElements(); + for (unsigned Idx = OrigElts; Idx < WideElts; Idx++) + Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem, + DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); + + return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Op, N->getFlags()); +} + //===----------------------------------------------------------------------===// // Vector Widening Utilities Index: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5617,3 +5617,61 @@ "Unexpected result type for S/UMULO legalization"); return true; } + +SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const { + SDLoc dl(Node); + bool NoNaN = Node->getFlags().hasNoNaNs(); + unsigned BaseOpcode = 0; + switch (Node->getOpcode()) { + default: llvm_unreachable("Expected VECREDUCE opcode"); + case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break; + case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break; + case ISD::VECREDUCE_ADD: BaseOpcode = ISD::ADD; break; + case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break; + case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break; + case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break; + case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break; + case ISD::VECREDUCE_SMAX: BaseOpcode = ISD::SMAX; break; + case ISD::VECREDUCE_SMIN: BaseOpcode = ISD::SMIN; break; + case ISD::VECREDUCE_UMAX: BaseOpcode = ISD::UMAX; break; + case ISD::VECREDUCE_UMIN: BaseOpcode = ISD::UMIN; break; + case ISD::VECREDUCE_FMAX: + BaseOpcode = NoNaN ? ISD::FMAXNUM : ISD::FMAXIMUM; + break; + case ISD::VECREDUCE_FMIN: + BaseOpcode = NoNaN ? ISD::FMINNUM : ISD::FMINIMUM; + break; + } + + SDValue Op = Node->getOperand(0); + EVT VT = Op.getValueType(); + + // Try to use a shuffle reduction for power of two vectors. + if (VT.isPow2VectorType()) { + while (VT.getVectorNumElements() > 1) { + EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); + if (!isOperationLegalOrCustom(BaseOpcode, HalfVT)) + break; + + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Op, dl); + Op = DAG.getNode(BaseOpcode, dl, HalfVT, Lo, Hi); + VT = HalfVT; + } + } + + EVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + + SmallVector Ops; + DAG.ExtractVectorElements(Op, Ops, 0, NumElts); + + SDValue Res = Ops[0]; + for (unsigned i = 1; i < NumElts; i++) + Res = DAG.getNode(BaseOpcode, dl, EltVT, Res, Ops[i], Node->getFlags()); + + // Result type may be wider than element type. + if (EltVT != Node->getValueType(0)) + Res = DAG.getNode(ISD::ANY_EXTEND, dl, Node->getValueType(0), Res); + return Res; +} Index: llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp +++ llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp @@ -665,6 +665,21 @@ // For most targets @llvm.get.dynamic.area.offset just returns 0. setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand); + + // Vector reduction default to expand. 
+ setOperationAction(ISD::VECREDUCE_FADD, VT, Expand); + setOperationAction(ISD::VECREDUCE_FMUL, VT, Expand); + setOperationAction(ISD::VECREDUCE_ADD, VT, Expand); + setOperationAction(ISD::VECREDUCE_MUL, VT, Expand); + setOperationAction(ISD::VECREDUCE_AND, VT, Expand); + setOperationAction(ISD::VECREDUCE_OR, VT, Expand); + setOperationAction(ISD::VECREDUCE_XOR, VT, Expand); + setOperationAction(ISD::VECREDUCE_SMAX, VT, Expand); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Expand); + setOperationAction(ISD::VECREDUCE_UMAX, VT, Expand); + setOperationAction(ISD::VECREDUCE_UMIN, VT, Expand); + setOperationAction(ISD::VECREDUCE_FMAX, VT, Expand); + setOperationAction(ISD::VECREDUCE_FMIN, VT, Expand); } // Most targets ignore the @llvm.prefetch intrinsic. Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -698,14 +698,16 @@ setOperationAction(ISD::MUL, MVT::v2i64, Custom); // Vector reductions - for (MVT VT : MVT::integer_valuetypes()) { + for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, + MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); } - for (MVT VT : MVT::fp_valuetypes()) { + for (MVT VT : { MVT::v4f16, MVT::v2f32, + MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); } Index: llvm/trunk/test/CodeGen/AArch64/vecreduce-add-legalization.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/vecreduce-add-legalization.ll +++ llvm/trunk/test/CodeGen/AArch64/vecreduce-add-legalization.ll @@ -0,0 +1,169 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK + +declare i1 @llvm.experimental.vector.reduce.add.i1.v1i1(<1 x i1> %a) +declare i8 @llvm.experimental.vector.reduce.add.i8.v1i8(<1 x i8> %a) +declare i16 @llvm.experimental.vector.reduce.add.i16.v1i16(<1 x i16> %a) +declare i24 @llvm.experimental.vector.reduce.add.i24.v1i24(<1 x i24> %a) +declare i32 @llvm.experimental.vector.reduce.add.i32.v1i32(<1 x i32> %a) +declare i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> %a) +declare i128 @llvm.experimental.vector.reduce.add.i128.v1i128(<1 x i128> %a) + +declare i8 @llvm.experimental.vector.reduce.add.i8.v3i8(<3 x i8> %a) +declare i8 @llvm.experimental.vector.reduce.add.i8.v9i8(<9 x i8> %a) +declare i32 @llvm.experimental.vector.reduce.add.i32.v3i32(<3 x i32> %a) +declare i1 @llvm.experimental.vector.reduce.add.i1.v4i1(<4 x i1> %a) +declare i24 @llvm.experimental.vector.reduce.add.i24.v4i24(<4 x i24> %a) +declare i128 @llvm.experimental.vector.reduce.add.i128.v2i128(<2 x i128> %a) +declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> %a) + +define i1 @test_v1i1(<1 x i1> %a) nounwind { +; CHECK-LABEL: test_v1i1: +; CHECK: // %bb.0: +; CHECK-NEXT: and w0, w0, #0x1 +; CHECK-NEXT: ret + %b = call i1 @llvm.experimental.vector.reduce.add.i1.v1i1(<1 x i1> %a) + ret i1 %b +} + +define i8 @test_v1i8(<1 x i8> %a) nounwind { +; CHECK-LABEL: test_v1i8: 
+; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret + %b = call i8 @llvm.experimental.vector.reduce.add.i8.v1i8(<1 x i8> %a) + ret i8 %b +} + +define i16 @test_v1i16(<1 x i16> %a) nounwind { +; CHECK-LABEL: test_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret + %b = call i16 @llvm.experimental.vector.reduce.add.i16.v1i16(<1 x i16> %a) + ret i16 %b +} + +define i24 @test_v1i24(<1 x i24> %a) nounwind { +; CHECK-LABEL: test_v1i24: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call i24 @llvm.experimental.vector.reduce.add.i24.v1i24(<1 x i24> %a) + ret i24 %b +} + +define i32 @test_v1i32(<1 x i32> %a) nounwind { +; CHECK-LABEL: test_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %b = call i32 @llvm.experimental.vector.reduce.add.i32.v1i32(<1 x i32> %a) + ret i32 %b +} + +define i64 @test_v1i64(<1 x i64> %a) nounwind { +; CHECK-LABEL: test_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %b = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> %a) + ret i64 %b +} + +define i128 @test_v1i128(<1 x i128> %a) nounwind { +; CHECK-LABEL: test_v1i128: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call i128 @llvm.experimental.vector.reduce.add.i128.v1i128(<1 x i128> %a) + ret i128 %b +} + +define i8 @test_v3i8(<3 x i8> %a) nounwind { +; CHECK-LABEL: test_v3i8: +; CHECK: // %bb.0: +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: mov v0.h[1], w1 +; CHECK-NEXT: mov v0.h[2], w2 +; CHECK-NEXT: addv h0, v0.4h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %b = call i8 @llvm.experimental.vector.reduce.add.i8.v3i8(<3 x i8> %a) + ret i8 %b +} + +define i8 @test_v9i8(<9 x i8> %a) nounwind { +; CHECK-LABEL: test_v9i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.b[9], wzr +; CHECK-NEXT: mov v0.b[10], wzr +; CHECK-NEXT: mov v0.b[11], wzr +; CHECK-NEXT: mov v0.b[12], wzr +; CHECK-NEXT: mov v0.b[13], wzr +; CHECK-NEXT: mov v0.b[14], wzr +; CHECK-NEXT: mov v0.b[15], wzr +; CHECK-NEXT: addv b0, v0.16b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %b = call i8 @llvm.experimental.vector.reduce.add.i8.v9i8(<9 x i8> %a) + ret i8 %b +} + +define i32 @test_v3i32(<3 x i32> %a) nounwind { +; CHECK-LABEL: test_v3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %b = call i32 @llvm.experimental.vector.reduce.add.i32.v3i32(<3 x i32> %a) + ret i32 %b +} + +define i1 @test_v4i1(<4 x i1> %a) nounwind { +; CHECK-LABEL: test_v4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: addv h0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %b = call i1 @llvm.experimental.vector.reduce.add.i1.v4i1(<4 x i1> %a) + ret i1 %b +} + +define i24 @test_v4i24(<4 x i24> %a) nounwind { +; CHECK-LABEL: test_v4i24: +; CHECK: // %bb.0: +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %b = call i24 @llvm.experimental.vector.reduce.add.i24.v4i24(<4 x i24> %a) + ret i24 %b +} + +define i128 @test_v2i128(<2 x i128> %a) nounwind { +; CHECK-LABEL: test_v2i128: +; CHECK: // %bb.0: +; CHECK-NEXT: adds x0, x0, x2 +; CHECK-NEXT: adcs x1, x1, x3 +; CHECK-NEXT: ret + %b = call i128 @llvm.experimental.vector.reduce.add.i128.v2i128(<2 x i128> %a) + ret i128 %b +} + +define i32 
@test_v16i32(<16 x i32> %a) nounwind { +; CHECK-LABEL: test_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %b = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> %a) + ret i32 %b +} Index: llvm/trunk/test/CodeGen/AArch64/vecreduce-and-legalization.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/vecreduce-and-legalization.ll +++ llvm/trunk/test/CodeGen/AArch64/vecreduce-and-legalization.ll @@ -0,0 +1,198 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK + +declare i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> %a) +declare i8 @llvm.experimental.vector.reduce.and.i8.v1i8(<1 x i8> %a) +declare i16 @llvm.experimental.vector.reduce.and.i16.v1i16(<1 x i16> %a) +declare i24 @llvm.experimental.vector.reduce.and.i24.v1i24(<1 x i24> %a) +declare i32 @llvm.experimental.vector.reduce.and.i32.v1i32(<1 x i32> %a) +declare i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> %a) +declare i128 @llvm.experimental.vector.reduce.and.i128.v1i128(<1 x i128> %a) + +declare i8 @llvm.experimental.vector.reduce.and.i8.v3i8(<3 x i8> %a) +declare i8 @llvm.experimental.vector.reduce.and.i8.v9i8(<9 x i8> %a) +declare i32 @llvm.experimental.vector.reduce.and.i32.v3i32(<3 x i32> %a) +declare i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> %a) +declare i24 @llvm.experimental.vector.reduce.and.i24.v4i24(<4 x i24> %a) +declare i128 @llvm.experimental.vector.reduce.and.i128.v2i128(<2 x i128> %a) +declare i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> %a) + +define i1 @test_v1i1(<1 x i1> %a) nounwind { +; CHECK-LABEL: test_v1i1: +; CHECK: // %bb.0: +; CHECK-NEXT: and w0, w0, #0x1 +; CHECK-NEXT: ret + %b = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> %a) + ret i1 %b +} + +define i8 @test_v1i8(<1 x i8> %a) nounwind { +; CHECK-LABEL: test_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret + %b = call i8 @llvm.experimental.vector.reduce.and.i8.v1i8(<1 x i8> %a) + ret i8 %b +} + +define i16 @test_v1i16(<1 x i16> %a) nounwind { +; CHECK-LABEL: test_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret + %b = call i16 @llvm.experimental.vector.reduce.and.i16.v1i16(<1 x i16> %a) + ret i16 %b +} + +define i24 @test_v1i24(<1 x i24> %a) nounwind { +; CHECK-LABEL: test_v1i24: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call i24 @llvm.experimental.vector.reduce.and.i24.v1i24(<1 x i24> %a) + ret i24 %b +} + +define i32 @test_v1i32(<1 x i32> %a) nounwind { +; CHECK-LABEL: test_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %b = call i32 @llvm.experimental.vector.reduce.and.i32.v1i32(<1 x i32> %a) + ret i32 %b +} + +define i64 @test_v1i64(<1 x i64> %a) nounwind { +; CHECK-LABEL: test_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %b = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> %a) + ret i64 %b +} + +define i128 @test_v1i128(<1 x i128> %a) nounwind { +; CHECK-LABEL: test_v1i128: 
+; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call i128 @llvm.experimental.vector.reduce.and.i128.v1i128(<1 x i128> %a) + ret i128 %b +} + +define i8 @test_v3i8(<3 x i8> %a) nounwind { +; CHECK-LABEL: test_v3i8: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, w1 +; CHECK-NEXT: and w8, w8, w2 +; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: ret + %b = call i8 @llvm.experimental.vector.reduce.and.i8.v3i8(<3 x i8> %a) + ret i8 %b +} + +define i8 @test_v9i8(<9 x i8> %a) nounwind { +; CHECK-LABEL: test_v9i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: mov v0.b[9], w8 +; CHECK-NEXT: mov v0.b[10], w8 +; CHECK-NEXT: mov v0.b[11], w8 +; CHECK-NEXT: mov v0.b[12], w8 +; CHECK-NEXT: mov v0.b[13], w8 +; CHECK-NEXT: mov v0.b[14], w8 +; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w8, v0.b[1] +; CHECK-NEXT: umov w9, v0.b[0] +; CHECK-NEXT: and w8, w9, w8 +; CHECK-NEXT: umov w9, v0.b[2] +; CHECK-NEXT: and w8, w8, w9 +; CHECK-NEXT: umov w9, v0.b[3] +; CHECK-NEXT: and w8, w8, w9 +; CHECK-NEXT: umov w9, v0.b[4] +; CHECK-NEXT: and w8, w8, w9 +; CHECK-NEXT: umov w9, v0.b[5] +; CHECK-NEXT: and w8, w8, w9 +; CHECK-NEXT: umov w9, v0.b[6] +; CHECK-NEXT: and w8, w8, w9 +; CHECK-NEXT: umov w9, v0.b[7] +; CHECK-NEXT: and w0, w8, w9 +; CHECK-NEXT: ret + %b = call i8 @llvm.experimental.vector.reduce.and.i8.v9i8(<9 x i8> %a) + ret i8 %b +} + +define i32 @test_v3i32(<3 x i32> %a) nounwind { +; CHECK-LABEL: test_v3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: mov v0.s[3], w8 +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: ret + %b = call i32 @llvm.experimental.vector.reduce.and.i32.v3i32(<3 x i32> %a) + ret i32 %b +} + +define i1 @test_v4i1(<4 x i1> %a) nounwind { +; CHECK-LABEL: test_v4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w10, v0.h[1] +; CHECK-NEXT: umov w11, v0.h[0] +; CHECK-NEXT: umov w9, v0.h[2] +; CHECK-NEXT: and w10, w11, w10 +; CHECK-NEXT: umov w8, v0.h[3] +; CHECK-NEXT: and w9, w10, w9 +; CHECK-NEXT: and w8, w9, w8 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %b = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> %a) + ret i1 %b +} + +define i24 @test_v4i24(<4 x i24> %a) nounwind { +; CHECK-LABEL: test_v4i24: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: ret + %b = call i24 @llvm.experimental.vector.reduce.and.i24.v4i24(<4 x i24> %a) + ret i24 %b +} + +define i128 @test_v2i128(<2 x i128> %a) nounwind { +; CHECK-LABEL: test_v2i128: +; CHECK: // %bb.0: +; CHECK-NEXT: and x0, x0, x2 +; CHECK-NEXT: and x1, x1, x3 +; CHECK-NEXT: ret + %b = call i128 @llvm.experimental.vector.reduce.and.i128.v2i128(<2 x i128> %a) + ret i128 %b +} + +define i32 @test_v16i32(<16 x i32> %a) nounwind { +; CHECK-LABEL: test_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: ret + %b = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> %a) + ret 
i32 %b +} Index: llvm/trunk/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +++ llvm/trunk/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK + +declare half @llvm.experimental.vector.reduce.fadd.f16.v1f16(half, <1 x half>) +declare float @llvm.experimental.vector.reduce.fadd.f32.v1f32(float, <1 x float>) +declare double @llvm.experimental.vector.reduce.fadd.f64.v1f64(double, <1 x double>) +declare fp128 @llvm.experimental.vector.reduce.fadd.f128.v1f128(fp128, <1 x fp128>) + +declare float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float, <3 x float>) +declare fp128 @llvm.experimental.vector.reduce.fadd.f128.v2f128(fp128, <2 x fp128>) +declare float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float, <16 x float>) + +define half @test_v1f16(<1 x half> %a) nounwind { +; CHECK-LABEL: test_v1f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call fast nnan half @llvm.experimental.vector.reduce.fadd.f16.v1f16(half 0.0, <1 x half> %a) + ret half %b +} + +define float @test_v1f32(<1 x float> %a) nounwind { +; CHECK-LABEL: test_v1f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: ret + %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v1f32(float 0.0, <1 x float> %a) + ret float %b +} + +define double @test_v1f64(<1 x double> %a) nounwind { +; CHECK-LABEL: test_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call fast nnan double @llvm.experimental.vector.reduce.fadd.f64.v1f64(double 0.0, <1 x double> %a) + ret double %b +} + +define fp128 @test_v1f128(<1 x fp128> %a) nounwind { +; CHECK-LABEL: test_v1f128: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call fast nnan fp128 @llvm.experimental.vector.reduce.fadd.f128.v1f128(fp128 zeroinitializer, <1 x fp128> %a) + ret fp128 %b +} + +define float @test_v3f32(<3 x float> %a) nounwind { +; CHECK-LABEL: test_v3f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s1, wzr +; CHECK-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret + %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a) + ret float %b +} + +define fp128 @test_v2f128(<2 x fp128> %a) nounwind { +; CHECK-LABEL: test_v2f128: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: bl __addtf3 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %b = call fast nnan fp128 @llvm.experimental.vector.reduce.fadd.f128.v2f128(fp128 zeroinitializer, <2 x fp128> %a) + ret fp128 %b +} + +define float @test_v16f32(<16 x float> %a) nounwind { +; CHECK-LABEL: test_v16f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fadd v1.4s, v1.4s, v3.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret + %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a) + ret float %b +} Index: llvm/trunk/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll +++ llvm/trunk/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK + +declare half @llvm.experimental.vector.reduce.fmax.f16.v1f16(<1 x half> %a) +declare float @llvm.experimental.vector.reduce.fmax.f32.v1f32(<1 x float> %a) +declare double @llvm.experimental.vector.reduce.fmax.f64.v1f64(<1 x double> %a) +declare fp128 @llvm.experimental.vector.reduce.fmax.f128.v1f128(<1 x fp128> %a) + +declare float @llvm.experimental.vector.reduce.fmax.f32.v3f32(<3 x float> %a) +declare fp128 @llvm.experimental.vector.reduce.fmax.f128.v2f128(<2 x fp128> %a) +declare float @llvm.experimental.vector.reduce.fmax.f32.v16f32(<16 x float> %a) + +define half @test_v1f16(<1 x half> %a) nounwind { +; CHECK-LABEL: test_v1f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call nnan half @llvm.experimental.vector.reduce.fmax.f16.v1f16(<1 x half> %a) + ret half %b +} + +define float @test_v1f32(<1 x float> %a) nounwind { +; CHECK-LABEL: test_v1f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NEXT: ret + %b = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v1f32(<1 x float> %a) + ret float %b +} + +define double @test_v1f64(<1 x double> %a) nounwind { +; CHECK-LABEL: test_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call nnan double @llvm.experimental.vector.reduce.fmax.f64.v1f64(<1 x double> %a) + ret double %b +} + +define fp128 @test_v1f128(<1 x fp128> %a) nounwind { +; CHECK-LABEL: test_v1f128: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call nnan fp128 @llvm.experimental.vector.reduce.fmax.f128.v1f128(<1 x fp128> %a) + ret fp128 %b +} + +define float @test_v3f32(<3 x float> %a) nounwind { +; CHECK-LABEL: test_v3f32: +; CHECK: // %bb.0: +; CHECK-NEXT: orr w8, wzr, #0x7f800000 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NEXT: fmaxnmv s0, v0.4s +; CHECK-NEXT: ret + %b = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v3f32(<3 x float> %a) + ret float %b +} + +define fp128 @test_v2f128(<2 x fp128> %a) nounwind { +; CHECK-LABEL: test_v2f128: +; CHECK: // %bb.0: +; CHECK-NEXT: b fmaxl + %b = call nnan fp128 @llvm.experimental.vector.reduce.fmax.f128.v2f128(<2 x fp128> %a) + ret fp128 %b +} + +define float @test_v16f32(<16 x float> %a) nounwind { +; CHECK-LABEL: test_v16f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fmaxnm v1.4s, v1.4s, v3.4s +; CHECK-NEXT: 
fmaxnm v0.4s, v0.4s, v2.4s +; CHECK-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; CHECK-NEXT: fmaxnmv s0, v0.4s +; CHECK-NEXT: ret + %b = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v16f32(<16 x float> %a) + ret float %b +} Index: llvm/trunk/test/CodeGen/AArch64/vecreduce-umax-legalization.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/vecreduce-umax-legalization.ll +++ llvm/trunk/test/CodeGen/AArch64/vecreduce-umax-legalization.ll @@ -0,0 +1,177 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK + +declare i1 @llvm.experimental.vector.reduce.umax.i1.v1i1(<1 x i1> %a) +declare i8 @llvm.experimental.vector.reduce.umax.i8.v1i8(<1 x i8> %a) +declare i16 @llvm.experimental.vector.reduce.umax.i16.v1i16(<1 x i16> %a) +declare i24 @llvm.experimental.vector.reduce.umax.i24.v1i24(<1 x i24> %a) +declare i32 @llvm.experimental.vector.reduce.umax.i32.v1i32(<1 x i32> %a) +declare i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> %a) +declare i128 @llvm.experimental.vector.reduce.umax.i128.v1i128(<1 x i128> %a) + +declare i8 @llvm.experimental.vector.reduce.umax.i8.v3i8(<3 x i8> %a) +declare i8 @llvm.experimental.vector.reduce.umax.i8.v9i8(<9 x i8> %a) +declare i32 @llvm.experimental.vector.reduce.umax.i32.v3i32(<3 x i32> %a) +declare i1 @llvm.experimental.vector.reduce.umax.i1.v4i1(<4 x i1> %a) +declare i24 @llvm.experimental.vector.reduce.umax.i24.v4i24(<4 x i24> %a) +declare i128 @llvm.experimental.vector.reduce.umax.i128.v2i128(<2 x i128> %a) +declare i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> %a) + +define i1 @test_v1i1(<1 x i1> %a) nounwind { +; CHECK-LABEL: test_v1i1: +; CHECK: // %bb.0: +; CHECK-NEXT: and w0, w0, #0x1 +; CHECK-NEXT: ret + %b = call i1 @llvm.experimental.vector.reduce.umax.i1.v1i1(<1 x i1> %a) + ret i1 %b +} + +define i8 @test_v1i8(<1 x i8> %a) nounwind { +; CHECK-LABEL: test_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret + %b = call i8 @llvm.experimental.vector.reduce.umax.i8.v1i8(<1 x i8> %a) + ret i8 %b +} + +define i16 @test_v1i16(<1 x i16> %a) nounwind { +; CHECK-LABEL: test_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret + %b = call i16 @llvm.experimental.vector.reduce.umax.i16.v1i16(<1 x i16> %a) + ret i16 %b +} + +define i24 @test_v1i24(<1 x i24> %a) nounwind { +; CHECK-LABEL: test_v1i24: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call i24 @llvm.experimental.vector.reduce.umax.i24.v1i24(<1 x i24> %a) + ret i24 %b +} + +define i32 @test_v1i32(<1 x i32> %a) nounwind { +; CHECK-LABEL: test_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %b = call i32 @llvm.experimental.vector.reduce.umax.i32.v1i32(<1 x i32> %a) + ret i32 %b +} + +define i64 @test_v1i64(<1 x i64> %a) nounwind { +; CHECK-LABEL: test_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %b = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> %a) + ret i64 %b +} + +define i128 @test_v1i128(<1 x i128> %a) nounwind { +; CHECK-LABEL: test_v1i128: +; CHECK: // %bb.0: +; CHECK-NEXT: ret + %b = call i128 @llvm.experimental.vector.reduce.umax.i128.v1i128(<1 x i128> %a) + 
ret i128 %b +} + +define i8 @test_v3i8(<3 x i8> %a) nounwind { +; CHECK-LABEL: test_v3i8: +; CHECK: // %bb.0: +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: mov v0.h[1], w1 +; CHECK-NEXT: mov v0.h[2], w2 +; CHECK-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-NEXT: umaxv h0, v0.4h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %b = call i8 @llvm.experimental.vector.reduce.umax.i8.v3i8(<3 x i8> %a) + ret i8 %b +} + +define i8 @test_v9i8(<9 x i8> %a) nounwind { +; CHECK-LABEL: test_v9i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.b[9], wzr +; CHECK-NEXT: mov v0.b[10], wzr +; CHECK-NEXT: mov v0.b[11], wzr +; CHECK-NEXT: mov v0.b[12], wzr +; CHECK-NEXT: mov v0.b[13], wzr +; CHECK-NEXT: mov v0.b[14], wzr +; CHECK-NEXT: mov v0.b[15], wzr +; CHECK-NEXT: umaxv b0, v0.16b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %b = call i8 @llvm.experimental.vector.reduce.umax.i8.v9i8(<9 x i8> %a) + ret i8 %b +} + +define i32 @test_v3i32(<3 x i32> %a) nounwind { +; CHECK-LABEL: test_v3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov v0.s[3], wzr +; CHECK-NEXT: umaxv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %b = call i32 @llvm.experimental.vector.reduce.umax.i32.v3i32(<3 x i32> %a) + ret i32 %b +} + +define i1 @test_v4i1(<4 x i1> %a) nounwind { +; CHECK-LABEL: test_v4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.4h, #1 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umaxv h0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %b = call i1 @llvm.experimental.vector.reduce.umax.i1.v4i1(<4 x i1> %a) + ret i1 %b +} + +define i24 @test_v4i24(<4 x i24> %a) nounwind { +; CHECK-LABEL: test_v4i24: +; CHECK: // %bb.0: +; CHECK-NEXT: bic v0.4s, #255, lsl #24 +; CHECK-NEXT: umaxv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %b = call i24 @llvm.experimental.vector.reduce.umax.i24.v4i24(<4 x i24> %a) + ret i24 %b +} + +define i128 @test_v2i128(<2 x i128> %a) nounwind { +; CHECK-LABEL: test_v2i128: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp x0, x2 +; CHECK-NEXT: csel x8, x0, x2, hi +; CHECK-NEXT: cmp x1, x3 +; CHECK-NEXT: csel x9, x0, x2, hi +; CHECK-NEXT: csel x0, x8, x9, eq +; CHECK-NEXT: csel x1, x1, x3, hi +; CHECK-NEXT: ret + %b = call i128 @llvm.experimental.vector.reduce.umax.i128.v2i128(<2 x i128> %a) + ret i128 %b +} + +define i32 @test_v16i32(<16 x i32> %a) nounwind { +; CHECK-LABEL: test_v16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umax v1.4s, v1.4s, v3.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v2.4s +; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s +; CHECK-NEXT: umaxv s0, v0.4s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %b = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> %a) + ret i32 %b +}
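One additional hedged IR sketch (not among the committed tests; the declaration matches the fmax test above and the function name is illustrative): the NoNaN choice in expandVecReduce comes from the node's no-NaNs fast-math flag, so without nnan on the call the generic expansion uses the NaN-propagating ISD::FMAXIMUM as the base opcode instead of ISD::FMAXNUM.

declare float @llvm.experimental.vector.reduce.fmax.f32.v3f32(<3 x float> %a)

; No nnan flag here, so VECREDUCE_FMAX is expanded with FMAXIMUM operations
; rather than FMAXNUM.
define float @test_v3f32_nan(<3 x float> %a) {
  %b = call float @llvm.experimental.vector.reduce.fmax.f32.v3f32(<3 x float> %a)
  ret float %b
}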