diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h
--- a/llvm/lib/Target/VE/VECustomDAG.h
+++ b/llvm/lib/Target/VE/VECustomDAG.h
@@ -24,6 +24,7 @@
 Optional<unsigned> getVVPOpcode(unsigned Opcode);
 
 bool isVVPBinaryOp(unsigned Opcode);
+bool isVVPReductionOp(unsigned Opcode);
 
 MVT splitVectorType(MVT VT);
 
@@ -106,6 +107,12 @@
 
 SDValue getGatherScatterScale(SDValue Op);
 
+unsigned getScalarReductionOpcode(unsigned VVPOC, bool IsMask);
+
+// Whether this VP_REDUCE_* / VECREDUCE_* / VVP_REDUCE_* SDNode has a start
+// parameter.
+bool hasReductionStartParam(unsigned VVPOC);
+
 /// } Node Properties
 
 enum class Packing {
@@ -172,6 +179,12 @@
   SDValue getUNDEF(EVT VT) const { return DAG.getUNDEF(VT); }
   /// } getNode
 
+  /// Legalizing getNode {
+  SDValue getLegalReductionOpVVP(unsigned VVPOpcode, EVT ResVT, SDValue StartV,
+                                 SDValue VectorV, SDValue Mask, SDValue AVL,
+                                 SDNodeFlags Flags) const;
+  /// } Legalizing getNode
+
   /// Packing {
   SDValue getUnpack(EVT DestVT, SDValue Vec, PackElem Part, SDValue AVL) const;
   SDValue getPack(EVT DestVT, SDValue LoVec, SDValue HiVec, SDValue AVL) const;
diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp
--- a/llvm/lib/Target/VE/VECustomDAG.cpp
+++ b/llvm/lib/Target/VE/VECustomDAG.cpp
@@ -138,6 +138,15 @@
   return false;
 }
 
+bool isVVPReductionOp(unsigned Opcode) {
+  switch (Opcode) {
+#define ADD_REDUCE_VVP_OP(VVP_NAME, SDNAME) case VEISD::VVP_NAME:
+#include "VVPNodes.def"
+    return true;
+  }
+  return false;
+}
+
 // Return the AVL operand position for this VVP or VEC Op.
 Optional<int> getAVLPos(unsigned Opc) {
   // This is only available for VP SDNodes
@@ -235,9 +244,14 @@
   }
 
   // Translate to VVP where possible.
+  unsigned OriginalOC = OC;
   if (auto VVPOpc = getVVPOpcode(OC))
     OC = *VVPOpc;
 
+  if (isVVPReductionOp(OC))
+    return Op->getOperand(hasReductionStartParam(OriginalOC) ? 1 : 0)
+        .getValueType();
+
   switch (OC) {
   default:
   case VEISD::VVP_SETCC:
@@ -320,6 +334,27 @@
   return SDValue();
 }
 
+bool hasReductionStartParam(unsigned OPC) {
+  // TODO: Ordered reduction opcodes.
+  if (ISD::isVPReduction(OPC))
+    return true;
+  return false;
+}
+
+unsigned getScalarReductionOpcode(unsigned VVPOC, bool IsMask) {
+  assert(!IsMask && "Mask reduction isel");
+
+  switch (VVPOC) {
+#define HANDLE_VVP_REDUCE_TO_SCALAR(VVP_RED_ISD, REDUCE_ISD)                   \
+  case VEISD::VVP_RED_ISD:                                                     \
+    return ISD::REDUCE_ISD;
+#include "VVPNodes.def"
+  default:
+    break;
+  }
+  llvm_unreachable("Cannot scalarize this reduction opcode!");
+}
+
 /// } Node Properties
 
 SDValue getNodeAVL(SDValue Op) {
@@ -499,4 +534,31 @@
   return ResPtr;
 }
 
+SDValue VECustomDAG::getLegalReductionOpVVP(unsigned VVPOpcode, EVT ResVT,
+                                            SDValue StartV, SDValue VectorV,
+                                            SDValue Mask, SDValue AVL,
+                                            SDNodeFlags Flags) const {
+
+  // Optionally attach the start param with a scalar op (where it is
+  // unsupported).
+  bool scalarizeStartParam = StartV && !hasReductionStartParam(VVPOpcode);
+  bool IsMaskReduction = isMaskType(VectorV.getValueType());
+  assert(!IsMaskReduction && "TODO Implement");
+  auto AttachStartValue = [&](SDValue ReductionResV) {
+    if (!scalarizeStartParam)
+      return ReductionResV;
+    auto ScalarOC = getScalarReductionOpcode(VVPOpcode, IsMaskReduction);
+    return getNode(ScalarOC, ResVT, {StartV, ReductionResV});
+  };
+
+  // Pass the start value through if the VVP node takes a start operand.
+  if (!scalarizeStartParam && StartV) {
+    assert(hasReductionStartParam(VVPOpcode));
+    return AttachStartValue(
+        getNode(VVPOpcode, ResVT, {StartV, VectorV, Mask, AVL}, Flags));
+  } else
+    return AttachStartValue(
+        getNode(VVPOpcode, ResVT, {VectorV, Mask, AVL}, Flags));
+}
+
 } // namespace llvm
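Note (illustration, not part of the patch): once the reduction entries added to VVPNodes.def later in this change are substituted, the two X-macro switches above expand roughly to the following sketch:

// Approximate expansion of isVVPReductionOp() after including VVPNodes.def.
bool isVVPReductionOp(unsigned Opcode) {
  switch (Opcode) {
  case VEISD::VVP_REDUCE_ADD:
  case VEISD::VVP_REDUCE_AND:
  case VEISD::VVP_REDUCE_OR:
  case VEISD::VVP_REDUCE_XOR:
  case VEISD::VVP_REDUCE_SMAX:
    return true;
  }
  return false;
}

// Approximate expansion of the switch body in getScalarReductionOpcode():
//   case VEISD::VVP_REDUCE_ADD:  return ISD::ADD;
//   case VEISD::VVP_REDUCE_AND:  return ISD::AND;
//   case VEISD::VVP_REDUCE_OR:   return ISD::OR;
//   case VEISD::VVP_REDUCE_XOR:  return ISD::XOR;
//   case VEISD::VVP_REDUCE_SMAX: return ISD::SMAX;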
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -332,6 +332,14 @@
     for (unsigned MemOpc : {ISD::MLOAD, ISD::MSTORE, ISD::LOAD, ISD::STORE})
       setOperationAction(MemOpc, VT, Custom);
+
+    const ISD::NodeType IntReductionOCs[] = {
+        ISD::VECREDUCE_ADD,  ISD::VECREDUCE_MUL,  ISD::VECREDUCE_AND,
+        ISD::VECREDUCE_OR,   ISD::VECREDUCE_XOR,  ISD::VECREDUCE_SMIN,
+        ISD::VECREDUCE_SMAX, ISD::VECREDUCE_UMIN, ISD::VECREDUCE_UMAX};
+
+    for (unsigned IntRedOpc : IntReductionOCs)
+      setOperationAction(IntRedOpc, VT, Custom);
   }
 }
diff --git a/llvm/lib/Target/VE/VETargetTransformInfo.h b/llvm/lib/Target/VE/VETargetTransformInfo.h
--- a/llvm/lib/Target/VE/VETargetTransformInfo.h
+++ b/llvm/lib/Target/VE/VETargetTransformInfo.h
@@ -61,6 +61,25 @@
 
   bool enableVPU() const { return getST()->enableVPU(); }
 
+  static bool isSupportedReduction(Intrinsic::ID ReductionID) {
+#define VEC_VP_CASE(SUFFIX)                                                    \
+  case Intrinsic::vp_reduce_##SUFFIX:                                          \
+  case Intrinsic::vector_reduce_##SUFFIX:
+
+    switch (ReductionID) {
+      VEC_VP_CASE(add)
+      VEC_VP_CASE(and)
+      VEC_VP_CASE(or)
+      VEC_VP_CASE(xor)
+      VEC_VP_CASE(smax)
+      return true;
+
+    default:
+      return false;
+    }
+#undef VEC_VP_CASE
+  }
+
 public:
   explicit VETTIImpl(const VETargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@@ -127,6 +146,12 @@
     return isVectorLaneType(*getLaneType(DataType));
   }
   // } Load & Store
+
+  bool shouldExpandReduction(const IntrinsicInst *II) const {
+    if (!enableVPU())
+      return true;
+    return !isSupportedReduction(II->getIntrinsicID());
+  }
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp
--- a/llvm/lib/Target/VE/VVPISelLowering.cpp
+++ b/llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -56,7 +56,7 @@
     return lowerVVP_GATHER_SCATTER(Op, CDAG);
   }
 
-  EVT OpVecVT = Op.getValueType();
+  EVT OpVecVT = *getIdiomaticVectorType(Op.getNode());
   EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT);
   auto Packing = getTypePacking(LegalVecVT.getSimpleVT());
 
@@ -84,6 +84,14 @@
     return CDAG.getNode(VVPOpcode, LegalVecVT,
                         {Op->getOperand(0), Op->getOperand(1), Mask, AVL});
   }
+  if (isVVPReductionOp(VVPOpcode)) {
+    auto SrcHasStart = hasReductionStartParam(Op->getOpcode());
+    SDValue StartV = SrcHasStart ? Op->getOperand(0) : SDValue();
+    SDValue VectorV = Op->getOperand(SrcHasStart ? 1 : 0);
+    return CDAG.getLegalReductionOpVVP(VVPOpcode, Op.getValueType(), StartV,
+                                       VectorV, Mask, AVL, Op->getFlags());
+  }
+
   if (VVPOpcode == VEISD::VVP_SELECT) {
     auto Mask = Op->getOperand(0);
     auto OnTrue = Op->getOperand(1);
@@ -91,10 +99,11 @@
     return CDAG.getNode(VVPOpcode, LegalVecVT, {OnTrue, OnFalse, Mask, AVL});
   }
   if (VVPOpcode == VEISD::VVP_SETCC) {
+    EVT LegalResVT = getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
     auto LHS = Op->getOperand(0);
     auto RHS = Op->getOperand(1);
     auto Pred = Op->getOperand(2);
-    return CDAG.getNode(VVPOpcode, LegalVecVT, {LHS, RHS, Pred, Mask, AVL});
+    return CDAG.getNode(VVPOpcode, LegalResVT, {LHS, RHS, Pred, Mask, AVL});
   }
   llvm_unreachable("lowerToVVP called for unexpected SDNode.");
 }
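Note (illustration, not part of the patch): hasReductionStartParam() returns true only for the VP_REDUCE_* opcodes, and the VVP reduction nodes introduced here take no start operand, so a start value is always folded back in with the matching scalar opcode. Written out without the getLegalReductionOpVVP() helper, the reduction branch above effectively builds:

// Sketch of the node shapes produced for the two reduction flavours:
//   vp.reduce.add(%start, %v, %m, %evl) -> ISD::ADD(%start, VVP_REDUCE_ADD(%v, %m, AVL))
//   vector.reduce.add(%v)               -> VVP_REDUCE_ADD(%v, AllTrueMask, AVL)
if (SrcHasStart) {
  SDValue Red = CDAG.getNode(VVPOpcode, Op.getValueType(), {VectorV, Mask, AVL});
  unsigned ScalarOC = getScalarReductionOpcode(VVPOpcode, /*IsMask=*/false);
  return CDAG.getNode(ScalarOC, Op.getValueType(), {StartV, Red});
}
return CDAG.getNode(VVPOpcode, Op.getValueType(), {VectorV, Mask, AVL});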
diff --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td
--- a/llvm/lib/Target/VE/VVPInstrInfo.td
+++ b/llvm/lib/Target/VE/VVPInstrInfo.td
@@ -53,8 +53,6 @@
   IsVLVT<3>
 ]>;
 
-// Binary Operators {
-
 // BinaryOp(x,y,mask,vl)
 def SDTIntBinOpVVP : SDTypeProfile<1, 4, [      // vp_add, vp_and, etc.
   SDTCisSameAs<0, 1>,
@@ -95,6 +93,15 @@
   IsVLVT<5>
 ]>;
 
+// vvp_reduce(vector, mask, vl)
+def SDTReduceVVP : SDTypeProfile<1, 3, [
+  SDTCisVec<1>,
+  SDTCisInt<2>,
+  SDTCisVec<2>,
+  SDTCisSameNumEltsAs<1,2>,
+  IsVLVT<3>
+]>;
+
 // Binary operator commutative pattern.
 class vvp_commutative<SDNode RootOp> :
@@ -135,8 +142,6 @@
 def c_vvp_fmul : vvp_commutative<vvp_fmul>;
 def vvp_fdiv : SDNode<"VEISD::VVP_FDIV", SDTFPBinOpVVP>;
 
-// } Binary Operators
-
 def vvp_scatter : SDNode<"VEISD::VVP_SCATTER", SDTScatterVVP,
                          [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 def vvp_gather : SDNode<"VEISD::VVP_GATHER", SDTGatherVVP,
@@ -147,6 +152,15 @@
 def vvp_store : SDNode<"VEISD::VVP_STORE", SDTStoreVVP,
                        [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
+// Reductions
+
+// int reductions
+def vvp_reduce_add : SDNode<"VEISD::VVP_REDUCE_ADD", SDTReduceVVP>;
+def vvp_reduce_and : SDNode<"VEISD::VVP_REDUCE_AND", SDTReduceVVP>;
+def vvp_reduce_or : SDNode<"VEISD::VVP_REDUCE_OR", SDTReduceVVP>;
+def vvp_reduce_xor : SDNode<"VEISD::VVP_REDUCE_XOR", SDTReduceVVP>;
+def vvp_reduce_smax : SDNode<"VEISD::VVP_REDUCE_SMAX", SDTReduceVVP>;
+
 def vvp_select : SDNode<"VEISD::VVP_SELECT", SDTSelectVVP>;
diff --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
--- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
@@ -434,3 +434,36 @@
 defm : Set_CC;
 defm : Set_CC;
 defm : Set_CC;
+
+multiclass Reduce_GenericInt<ValueType VectorVT,
+    RegisterClass ResRC, ValueType ResVT,
+    string VVPRedOp, string RedInstName> {
+  // Unmasked.
+  def : Pat <(ResVT (!cast<SDPatternOperator>("vvp_reduce_"#VVPRedOp)
+                  VectorVT:$vx, (v256i1 true_mask), i32:$vl)),
+             (COPY_TO_REGCLASS
+               (!cast<Instruction>("LVSvi")
+                 (!cast<Instruction>(RedInstName#"vl") $vx, $vl), 0),
+               ResRC)>;
+
+  // Masked.
+  def : Pat <(ResVT (!cast<SDPatternOperator>("vvp_reduce_"#VVPRedOp)
+                  VectorVT:$vx, v256i1:$vm, i32:$vl)),
+             (COPY_TO_REGCLASS
+               (!cast<Instruction>("LVSvi")
+                 (!cast<Instruction>(RedInstName#"vml") $vx, $vm, $vl), 0),
+               ResRC)>;
+}
+
+multiclass IntReduce_ShortLong<ValueType VectorVT,
+    RegisterClass ResRC, ValueType ResVT,
+    string SumSuffix, string MinMaxSuffix> {
+  defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "add", "VSUM"#SumSuffix>;
+  defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "and", "VRAND">;
+  defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "or", "VROR">;
+  defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "xor", "VRXOR">;
+  defm: Reduce_GenericInt<VectorVT, ResRC, ResVT, "smax", "VRMAX"#MinMaxSuffix>;
+}
+
+defm: IntReduce_ShortLong<v256i64, I64, i64, "L", "SLFST">;
+defm: IntReduce_ShortLong<v256i32, I32, i32, "WSX", "SWFSTSX">;
diff --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def
--- a/llvm/lib/Target/VE/VVPNodes.def
+++ b/llvm/lib/Target/VE/VVPNodes.def
@@ -44,12 +44,38 @@
 #define REGISTER_PACKED(OPC)
 #endif
 
-ADD_VVP_OP(VVP_GATHER, MGATHER) HANDLE_VP_TO_VVP(VP_GATHER, VVP_GATHER)
-ADD_VVP_OP(VVP_SCATTER, MSCATTER) HANDLE_VP_TO_VVP(VP_SCATTER, VVP_SCATTER)
+/// ADD_REDUCE_VVP_OP(OPC, SDNAME)
+/// \p OPC The VVP opcode of the operation.
+/// \p SDNAME The standard opcode of the operation.
+#ifndef ADD_REDUCE_VVP_OP
+#define ADD_REDUCE_VVP_OP(OPC, SDNAME) ADD_VVP_OP(OPC, SDNAME)
+#endif
+
+// Scalar standard ISD to perform this reduction.
+#ifndef HANDLE_VVP_REDUCE_TO_SCALAR
+#define HANDLE_VVP_REDUCE_TO_SCALAR(VVP_RED_ISD, REDUCE_ISD)
+#endif
+
+/// Reductions.
+#define HELPER_REDUCTION(OPC, SCALAR_OPC) \
+  ADD_REDUCE_VVP_OP(VVP_REDUCE_##OPC, VECREDUCE_##OPC) \
+  HANDLE_VP_TO_VVP(VP_REDUCE_##OPC, VVP_REDUCE_##OPC) \
+  HANDLE_VVP_REDUCE_TO_SCALAR(VVP_REDUCE_##OPC, SCALAR_OPC)
+
+HELPER_REDUCTION(ADD, ADD)
+HELPER_REDUCTION(AND, AND)
+HELPER_REDUCTION(OR, OR)
+HELPER_REDUCTION(XOR, XOR)
+HELPER_REDUCTION(SMAX, SMAX)
+
+#undef HELPER_REDUCTION
 
 ADD_VVP_OP(VVP_LOAD,LOAD) HANDLE_VP_TO_VVP(VP_LOAD, VVP_LOAD) REGISTER_PACKED(VVP_LOAD)
 ADD_VVP_OP(VVP_STORE,STORE) HANDLE_VP_TO_VVP(VP_STORE, VVP_STORE) REGISTER_PACKED(VVP_STORE)
 
+ADD_VVP_OP(VVP_GATHER, MGATHER) HANDLE_VP_TO_VVP(VP_GATHER, VVP_GATHER)
+ADD_VVP_OP(VVP_SCATTER, MSCATTER) HANDLE_VP_TO_VVP(VP_SCATTER, VVP_SCATTER)
+
 // Integer arithmetic.
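Note (illustration, not part of the patch): each HELPER_REDUCTION row fans out into the three per-client handler macros; the ADD row, for example, expands to:

ADD_REDUCE_VVP_OP(VVP_REDUCE_ADD, VECREDUCE_ADD)   // registers the VVP reduction opcode
HANDLE_VP_TO_VVP(VP_REDUCE_ADD, VVP_REDUCE_ADD)    // maps the VP opcode onto it
HANDLE_VVP_REDUCE_TO_SCALAR(VVP_REDUCE_ADD, ADD)   // scalar ISD used to fold in a start value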
 ADD_BINARY_VVP_OP_COMPACT(ADD) REGISTER_PACKED(VVP_ADD)
 ADD_BINARY_VVP_OP_COMPACT(SUB) REGISTER_PACKED(VVP_SUB)
@@ -78,8 +104,11 @@
 HANDLE_VP_TO_VVP(VP_SELECT, VVP_SELECT)
 HANDLE_VP_TO_VVP(VP_MERGE, VVP_SELECT)
+
 #undef ADD_BINARY_VVP_OP
 #undef ADD_BINARY_VVP_OP_COMPACT
+#undef ADD_REDUCE_VVP_OP
 #undef ADD_VVP_OP
 #undef HANDLE_VP_TO_VVP
+#undef HANDLE_VVP_REDUCE_TO_SCALAR
 #undef REGISTER_PACKED
diff --git a/llvm/test/CodeGen/VE/Vector/vec_reduce_add.ll b/llvm/test/CodeGen/VE/Vector/vec_reduce_add.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_reduce_add.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.add.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_add_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_add_v256i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vsum.l %v0, %v0
+; CHECK-NEXT:    lvs %s0, %v0(0)
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i64 @llvm.vector.reduce.add.v256i64( <256 x i64> %v)
+  ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.add.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_add_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_add_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vsum.w.sx %v0, %v0
+; CHECK-NEXT:    lvs %s0, %v0(0)
+; CHECK-NEXT:    or %s1, 0, %s0
+; CHECK-NEXT:    # implicit-def: $sx0
+; CHECK-NEXT:    or %s0, 0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i32 @llvm.vector.reduce.add.v256i32( <256 x i32> %v)
+  ret i32 %r
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vec_reduce_and.ll b/llvm/test/CodeGen/VE/Vector/vec_reduce_and.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_reduce_and.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.and.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_and_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_and_v256i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vrand %v0, %v0
+; CHECK-NEXT:    lvs %s0, %v0(0)
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i64 @llvm.vector.reduce.and.v256i64( <256 x i64> %v)
+  ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.and.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_and_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_and_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vrand %v0, %v0
+; CHECK-NEXT:    lvs %s0, %v0(0)
+; CHECK-NEXT:    or %s1, 0, %s0
+; CHECK-NEXT:    # implicit-def: $sx0
+; CHECK-NEXT:    or %s0, 0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i32 @llvm.vector.reduce.and.v256i32( <256 x i32> %v)
+  ret i32 %r
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vec_reduce_or.ll b/llvm/test/CodeGen/VE/Vector/vec_reduce_or.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_reduce_or.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.or.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_or_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_or_v256i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vror %v0, %v0
+; CHECK-NEXT:    lvs %s0, %v0(0)
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i64 @llvm.vector.reduce.or.v256i64( <256 x i64> %v)
+  ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.or.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_or_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_or_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vror %v0, %v0
+; CHECK-NEXT:    lvs %s0, %v0(0)
+; CHECK-NEXT:    or %s1, 0, %s0
+; CHECK-NEXT:    # implicit-def: $sx0
+; CHECK-NEXT:    or %s0, 0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i32 @llvm.vector.reduce.or.v256i32( <256 x i32> %v)
+  ret i32 %r
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vec_reduce_smax.ll b/llvm/test/CodeGen/VE/Vector/vec_reduce_smax.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_reduce_smax.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.smax.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_smax_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_smax_v256i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vrmaxs.l.fst %v0, %v0
+; CHECK-NEXT:    lvs %s0, %v0(0)
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i64 @llvm.vector.reduce.smax.v256i64( <256 x i64> %v)
+  ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.smax.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_smax_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_smax_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vrmaxs.w.fst.sx %v0, %v0
+; CHECK-NEXT:    lvs %s0, %v0(0)
+; CHECK-NEXT:    or %s1, 0, %s0
+; CHECK-NEXT:    # implicit-def: $sx0
+; CHECK-NEXT:    or %s0, 0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i32 @llvm.vector.reduce.smax.v256i32( <256 x i32> %v)
+  ret i32 %r
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vec_reduce_xor.ll b/llvm/test/CodeGen/VE/Vector/vec_reduce_xor.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_reduce_xor.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vector.reduce.xor.v256i64(<256 x i64>)
+
+define fastcc i64 @vec_reduce_xor_v256i64(<256 x i64> %v) {
+; CHECK-LABEL: vec_reduce_xor_v256i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vrxor %v0, %v0
+; CHECK-NEXT:    lvs %s0, %v0(0)
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i64 @llvm.vector.reduce.xor.v256i64( <256 x i64> %v)
+  ret i64 %r
+}
+
+declare i32 @llvm.vector.reduce.xor.v256i32(<256 x i32>)
+
+define fastcc i32 @vec_reduce_xor_v256i32(<256 x i32> %v) {
+; CHECK-LABEL: vec_reduce_xor_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s0, 256
+; CHECK-NEXT:    # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vrxor %v0, %v0
+; CHECK-NEXT:    lvs %s0, %v0(0)
+; CHECK-NEXT:    or %s1, 0, %s0
+; CHECK-NEXT:    # implicit-def: $sx0
+; CHECK-NEXT:    or %s0, 0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i32 @llvm.vector.reduce.xor.v256i32( <256 x i32> %v)
+  ret i32 %r
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vp_reduce_add.ll b/llvm/test/CodeGen/VE/Vector/vp_reduce_add.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_reduce_add.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.add.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_add_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_add_v256i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vsum.l %v0, %v0, %vm1
+; CHECK-NEXT:    lvs %s1, %v0(0)
+; CHECK-NEXT:    adds.l %s0, %s0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i64 @llvm.vp.reduce.add.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+  ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.add.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_add_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_add_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vsum.w.sx %v0, %v0, %vm1
+; CHECK-NEXT:    lvs %s1, %v0(0)
+; CHECK-NEXT:    or %s1, 0, %s1
+; CHECK-NEXT:    adds.w.sx %s1, %s0, %s1
+; CHECK-NEXT:    # implicit-def: $sx0
+; CHECK-NEXT:    or %s0, 0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i32 @llvm.vp.reduce.add.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+  ret i32 %r
+}
+
+
diff --git a/llvm/test/CodeGen/VE/Vector/vp_reduce_and.ll b/llvm/test/CodeGen/VE/Vector/vp_reduce_and.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_reduce_and.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.and.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_and_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_and_v256i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vrand %v0, %v0, %vm1
+; CHECK-NEXT:    lvs %s1, %v0(0)
+; CHECK-NEXT:    and %s0, %s0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i64 @llvm.vp.reduce.and.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+  ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.and.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_and_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_and_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s0, (32)0
+; CHECK-NEXT:    # kill: def $sw2 killed $sw2 killed $sx2
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vrand %v0, %v0, %vm1
+; CHECK-NEXT:    lvs %s1, %v0(0)
+; CHECK-NEXT:    or %s2, 0, %s1
+; CHECK-NEXT:    # implicit-def: $sx1
+; CHECK-NEXT:    or %s1, 0, %s2
+; CHECK-NEXT:    and %s0, %s0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i32 @llvm.vp.reduce.and.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+  ret i32 %r
+}
+
+
diff --git a/llvm/test/CodeGen/VE/Vector/vp_reduce_or.ll b/llvm/test/CodeGen/VE/Vector/vp_reduce_or.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_reduce_or.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.or.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_or_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_or_v256i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vror %v0, %v0, %vm1
+; CHECK-NEXT:    lvs %s1, %v0(0)
+; CHECK-NEXT:    or %s0, %s0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i64 @llvm.vp.reduce.or.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+  ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.or.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_or_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_or_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s0, (32)0
+; CHECK-NEXT:    # kill: def $sw2 killed $sw2 killed $sx2
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vror %v0, %v0, %vm1
+; CHECK-NEXT:    lvs %s1, %v0(0)
+; CHECK-NEXT:    or %s2, 0, %s1
+; CHECK-NEXT:    # implicit-def: $sx1
+; CHECK-NEXT:    or %s1, 0, %s2
+; CHECK-NEXT:    or %s0, %s0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i32 @llvm.vp.reduce.or.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+  ret i32 %r
+}
+
+
diff --git a/llvm/test/CodeGen/VE/Vector/vp_reduce_smax.ll b/llvm/test/CodeGen/VE/Vector/vp_reduce_smax.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_reduce_smax.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.smax.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_smax_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_smax_v256i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vrmaxs.l.fst %v0, %v0, %vm1
+; CHECK-NEXT:    lvs %s1, %v0(0)
+; CHECK-NEXT:    maxs.l %s0, %s0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i64 @llvm.vp.reduce.smax.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+  ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.smax.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_smax_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_smax_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    # kill: def $sw0 killed $sw0 killed $sx0
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vrmaxs.w.fst.sx %v0, %v0, %vm1
+; CHECK-NEXT:    lvs %s1, %v0(0)
+; CHECK-NEXT:    or %s1, 0, %s1
+; CHECK-NEXT:    maxs.w.sx %s1, %s0, %s1
+; CHECK-NEXT:    # implicit-def: $sx0
+; CHECK-NEXT:    or %s0, 0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i32 @llvm.vp.reduce.smax.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+  ret i32 %r
+}
+
+
diff --git a/llvm/test/CodeGen/VE/Vector/vp_reduce_xor.ll b/llvm/test/CodeGen/VE/Vector/vp_reduce_xor.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vp_reduce_xor.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 --march=ve -mattr=+vpu %s -o=/dev/stdout | FileCheck %s
+
+declare i64 @llvm.vp.reduce.xor.v256i64(i64, <256 x i64>, <256 x i1>, i32)
+
+define fastcc i64 @vp_reduce_xor_v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_xor_v256i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vrxor %v0, %v0, %vm1
+; CHECK-NEXT:    lvs %s1, %v0(0)
+; CHECK-NEXT:    xor %s0, %s0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i64 @llvm.vp.reduce.xor.v256i64(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n)
+  ret i64 %r
+}
+
+declare i32 @llvm.vp.reduce.xor.v256i32(i32, <256 x i32>, <256 x i1>, i32)
+
+define fastcc i32 @vp_reduce_xor_v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: vp_reduce_xor_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s2, %s0, (32)0
+; CHECK-NEXT:    # kill: def $sw2 killed $sw2 killed $sx2
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    # kill: def $sw1 killed $sw1 killed $sx1
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vrxor %v0, %v0, %vm1
+; CHECK-NEXT:    lvs %s1, %v0(0)
+; CHECK-NEXT:    or %s2, 0, %s1
+; CHECK-NEXT:    # implicit-def: $sx1
+; CHECK-NEXT:    or %s1, 0, %s2
+; CHECK-NEXT:    xor %s0, %s0, %s1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call i32 @llvm.vp.reduce.xor.v256i32(i32 %s, <256 x i32> %v, <256 x i1> %m, i32 %n)
+  ret i32 %r
+}
+
+