Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20857,7 +20857,7 @@
   unsigned Opcode = N->getOpcode();
 
   // VECREDUCE over 1-element vector is just an extract.
-  if (VT.getVectorNumElements() == 1) {
+  if (VT.getVectorElementCount().isScalar()) {
     SDLoc dl(N);
     SDValue Res =
         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3321,7 +3321,7 @@
     SDValue EltNo = Op.getOperand(1);
     EVT VecVT = InVec.getValueType();
     const unsigned EltBitWidth = VecVT.getScalarSizeInBits();
-    const unsigned NumSrcElts = VecVT.getVectorNumElements();
+    const unsigned NumSrcElts = VecVT.getVectorMinNumElements();
 
     // If BitWidth > EltBitWidth the value is anyext:ed. So we do not know
     // anything about the extended bits.
Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8014,6 +8014,10 @@
   SDValue Op = Node->getOperand(0);
   EVT VT = Op.getValueType();
 
+  if (VT.isScalableVector())
+    report_fatal_error(
+        "Expanding reductions for scalable vectors is undefined.");
+
   // Try to use a shuffle reduction for power of two vectors.
   if (VT.isPow2VectorType()) {
     while (VT.getVectorNumElements() > 1) {
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -935,8 +935,9 @@
   SDValue LowerFixedLengthVectorIntExtendToSVE(SDValue Op,
                                                SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFixedLengthReductionToSVE(unsigned Opcode, SDValue ScalarOp,
-                                         SelectionDAG &DAG) const;
+  SDValue LowerPredReductionToSVE(SDValue ScalarOp, SelectionDAG &DAG) const;
+  SDValue LowerReductionToSVE(unsigned Opcode, SDValue ScalarOp,
+                              SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorSelectToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorSetccToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1013,6 +1013,14 @@
       setOperationAction(ISD::SHL, VT, Custom);
       setOperationAction(ISD::SRL, VT, Custom);
       setOperationAction(ISD::SRA, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
     }
 
     // Illegal unpacked integer vector types.
@@ -1025,8 +1033,19 @@
       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
       setOperationAction(ISD::SELECT, VT, Custom);
       setOperationAction(ISD::SETCC, VT, Custom);
+      setOperationAction(ISD::SMIN, VT, Custom);
+      setOperationAction(ISD::UMIN, VT, Custom);
+      setOperationAction(ISD::SMAX, VT, Custom);
+      setOperationAction(ISD::UMAX, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
       setOperationAction(ISD::TRUNCATE, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
 
       // There are no legal MVT::nxv16f## based types.
       if (VT != MVT::nxv16i1) {
@@ -3914,15 +3933,27 @@
   case ISD::UDIV:
     return LowerDIV(Op, DAG);
   case ISD::SMIN:
+    if (Op.getValueType().getVectorElementType() == MVT::i1)
+      return DAG.getNode(ISD::OR, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(0), Op.getOperand(1));
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED,
                                /*OverrideNEON=*/true);
   case ISD::UMIN:
+    if (Op.getValueType().getVectorElementType() == MVT::i1)
+      return DAG.getNode(ISD::AND, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(0), Op.getOperand(1));
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED,
                                /*OverrideNEON=*/true);
   case ISD::SMAX:
+    if (Op.getValueType().getVectorElementType() == MVT::i1)
+      return DAG.getNode(ISD::AND, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(0), Op.getOperand(1));
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED,
                                /*OverrideNEON=*/true);
   case ISD::UMAX:
+    if (Op.getValueType().getVectorElementType() == MVT::i1)
+      return DAG.getNode(ISD::OR, SDLoc(Op), Op.getValueType(),
+                         Op.getOperand(0), Op.getOperand(1));
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED,
                                /*OverrideNEON=*/true);
   case ISD::SRA:
@@ -9767,30 +9798,45 @@
                       Op.getOpcode() == ISD::VECREDUCE_FADD ||
                       (Op.getOpcode() != ISD::VECREDUCE_ADD &&
                        SrcVT.getVectorElementType() == MVT::i64);
-  if (useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
+  if (SrcVT.isScalableVector() ||
+      useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
     switch (Op.getOpcode()) {
     case ISD::VECREDUCE_ADD:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
     case ISD::VECREDUCE_AND:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
+      if (SrcVT.getVectorElementType() == MVT::i1)
+        return LowerPredReductionToSVE(Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
     case ISD::VECREDUCE_OR:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
+      if (SrcVT.getVectorElementType() == MVT::i1)
+        return LowerPredReductionToSVE(Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
     case ISD::VECREDUCE_SMAX:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
+      if (SrcVT.getVectorElementType() == MVT::i1)
+        return LowerPredReductionToSVE(Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
     case ISD::VECREDUCE_SMIN:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
+      if (SrcVT.getVectorElementType() == MVT::i1)
+        return LowerPredReductionToSVE(Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
     case ISD::VECREDUCE_UMAX:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
+      if (SrcVT.getVectorElementType() == MVT::i1)
+        return LowerPredReductionToSVE(Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
     case ISD::VECREDUCE_UMIN:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
+      if (SrcVT.getVectorElementType() == MVT::i1)
+        return LowerPredReductionToSVE(Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
     case ISD::VECREDUCE_XOR:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
+      if (SrcVT.getVectorElementType() == MVT::i1)
+        return LowerPredReductionToSVE(Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
     case ISD::VECREDUCE_FADD:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
    case ISD::VECREDUCE_FMAX:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
    case ISD::VECREDUCE_FMIN:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
     default:
      llvm_unreachable("Unhandled fixed length reduction");
     }
@@ -16257,15 +16303,58 @@
   return convertFromScalableVector(DAG, VT, ScalableRes);
 }
 
-SDValue AArch64TargetLowering::LowerFixedLengthReductionToSVE(unsigned Opcode,
-    SDValue ScalarOp, SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
+                                                       SelectionDAG &DAG) const {
+  SDLoc DL(ReduceOp);
+  SDValue Op = ReduceOp.getOperand(0);
+  EVT OpVT = Op.getValueType();
+
+  if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
+
+  AArch64CC::CondCode Cond;
+  switch (ReduceOp.getOpcode()) {
+  default:
+    return SDValue();
+  case ISD::VECREDUCE_OR:
+  case ISD::VECREDUCE_UMAX:
+  case ISD::VECREDUCE_SMIN:
+    Cond = AArch64CC::ANY_ACTIVE;
+    return getPTest(DAG, ReduceOp.getValueType(), Pg, Op, Cond);
+  case ISD::VECREDUCE_AND:
+  case ISD::VECREDUCE_UMIN:
+  case ISD::VECREDUCE_SMAX:
+    Cond = AArch64CC::NONE_ACTIVE;
+    Op = DAG.getNode(ISD::XOR, SDLoc(ReduceOp), OpVT, Op, Pg);
+    return getPTest(DAG, ReduceOp.getValueType(), Pg, Op, Cond);
+  case ISD::VECREDUCE_XOR:
+    SDValue ID =
+        DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
+    SDValue Cntp =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
+    SDValue And = DAG.getNode(ISD::AND, DL, MVT::i64, Cntp,
+                              DAG.getConstant(1, DL, MVT::i64));
+    return DAG.getAnyExtOrTrunc(And, DL, MVT::i32);
+  }
+
+  return SDValue();
+}
+
+SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
+                                                   SDValue ScalarOp,
+                                                   SelectionDAG &DAG) const {
   SDLoc DL(ScalarOp);
   SDValue VecOp = ScalarOp.getOperand(0);
   EVT SrcVT = VecOp.getValueType();
 
+  if (useSVEForFixedLengthVectorVT(SrcVT, true)) {
+    EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
+    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
+  }
+
   SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
-  EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
-  VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
 
   // UADDV always returns an i64 result.
   EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ?
MVT::i64 : Index: llvm/test/CodeGen/AArch64/sve-int-pred-reduce.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-int-pred-reduce.ll @@ -0,0 +1,375 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; ANDV + +define i1 @reduce_and_nxv16i1( %vec) { +; CHECK-LABEL: reduce_and_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.and.i1.nxv16i1( %vec) + ret i1 %res +} + +define i1 @reduce_and_nxv8i1( %vec) { +; CHECK-LABEL: reduce_and_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.and.i1.nxv8i1( %vec) + ret i1 %res +} + +define i1 @reduce_and_nxv4i1( %vec) { +; CHECK-LABEL: reduce_and_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.and.i1.nxv4i1( %vec) + ret i1 %res +} + +define i1 @reduce_and_nxv2i1( %vec) { +; CHECK-LABEL: reduce_and_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.and.i1.nxv2i1( %vec) + ret i1 %res +} + +; ORV + +define i1 @reduce_or_nxv16i1( %vec) { +; CHECK-LABEL: reduce_or_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.or.i1.nxv16i1( %vec) + ret i1 %res +} + +define i1 @reduce_or_nxv8i1( %vec) { +; CHECK-LABEL: reduce_or_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.or.i1.nxv8i1( %vec) + ret i1 %res +} + +define i1 @reduce_or_nxv4i1( %vec) { +; CHECK-LABEL: reduce_or_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.or.i1.nxv4i1( %vec) + ret i1 %res +} + +define i1 @reduce_or_nxv2i1( %vec) { +; CHECK-LABEL: reduce_or_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.or.i1.nxv2i1( %vec) + ret i1 %res +} + +; XORV + +define i1 @reduce_xor_nxv16i1( %vec) { +; CHECK-LABEL: reduce_xor_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: cntp x8, p1, p0.b +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.xor.i1.nxv16i1( %vec) + ret i1 %res +} + +define i1 @reduce_xor_nxv8i1( %vec) { +; CHECK-LABEL: reduce_xor_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: cntp x8, p1, p0.h +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.xor.i1.nxv8i1( %vec) + ret i1 %res +} + +define i1 @reduce_xor_nxv4i1( %vec) { +; CHECK-LABEL: reduce_xor_nxv4i1: +; 
CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: cntp x8, p1, p0.s +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.xor.i1.nxv4i1( %vec) + ret i1 %res +} + +define i1 @reduce_xor_nxv2i1( %vec) { +; CHECK-LABEL: reduce_xor_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: cntp x8, p1, p0.d +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.xor.i1.nxv2i1( %vec) + ret i1 %res +} + +; SMAXV + +define i1 @reduce_smax_nxv16i1( %vec) { +; CHECK-LABEL: reduce_smax_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smax.i1.nxv16i1( %vec) + ret i1 %res +} + +define i1 @reduce_smax_nxv8i1( %vec) { +; CHECK-LABEL: reduce_smax_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smax.i1.nxv8i1( %vec) + ret i1 %res +} + +define i1 @reduce_smax_nxv4i1( %vec) { +; CHECK-LABEL: reduce_smax_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smax.i1.nxv4i1( %vec) + ret i1 %res +} + +define i1 @reduce_smax_nxv2i1( %vec) { +; CHECK-LABEL: reduce_smax_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smax.i1.nxv2i1( %vec) + ret i1 %res +} + +; SMINV + +define i1 @reduce_smin_nxv16i1( %vec) { +; CHECK-LABEL: reduce_smin_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smin.i1.nxv16i1( %vec) + ret i1 %res +} + +define i1 @reduce_smin_nxv8i1( %vec) { +; CHECK-LABEL: reduce_smin_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smin.i1.nxv8i1( %vec) + ret i1 %res +} + +define i1 @reduce_smin_nxv4i1( %vec) { +; CHECK-LABEL: reduce_smin_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smin.i1.nxv4i1( %vec) + ret i1 %res +} + +define i1 @reduce_smin_nxv2i1( %vec) { +; CHECK-LABEL: reduce_smin_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smin.i1.nxv2i1( %vec) + ret i1 %res +} + +; UMAXV + +define i1 @reduce_umax_nxv16i1( %vec) { +; CHECK-LABEL: reduce_umax_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umax.i1.nxv16i1( %vec) + ret i1 %res +} + +define i1 @reduce_umax_nxv8i1( %vec) { +; CHECK-LABEL: reduce_umax_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umax.i1.nxv8i1( %vec) + ret i1 %res +} + +define i1 @reduce_umax_nxv4i1( %vec) { +; CHECK-LABEL: reduce_umax_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s 
+; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umax.i1.nxv4i1( %vec) + ret i1 %res +} + +define i1 @reduce_umax_nxv2i1( %vec) { +; CHECK-LABEL: reduce_umax_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umax.i1.nxv2i1( %vec) + ret i1 %res +} + +; UMINV + +define i1 @reduce_umin_nxv16i1( %vec) { +; CHECK-LABEL: reduce_umin_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umin.i1.nxv16i1( %vec) + ret i1 %res +} + +define i1 @reduce_umin_nxv8i1( %vec) { +; CHECK-LABEL: reduce_umin_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umin.i1.nxv8i1( %vec) + ret i1 %res +} + +define i1 @reduce_umin_nxv4i1( %vec) { +; CHECK-LABEL: reduce_umin_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umin.i1.nxv4i1( %vec) + ret i1 %res +} + +define i1 @reduce_umin_nxv2i1( %vec) { +; CHECK-LABEL: reduce_umin_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umin.i1.nxv2i1( %vec) + ret i1 %res +} + +declare i1 @llvm.vector.reduce.and.i1.nxv16i1( %vec) +declare i1 @llvm.vector.reduce.and.i1.nxv8i1( %vec) +declare i1 @llvm.vector.reduce.and.i1.nxv4i1( %vec) +declare i1 @llvm.vector.reduce.and.i1.nxv2i1( %vec) + +declare i1 @llvm.vector.reduce.or.i1.nxv16i1( %vec) +declare i1 @llvm.vector.reduce.or.i1.nxv8i1( %vec) +declare i1 @llvm.vector.reduce.or.i1.nxv4i1( %vec) +declare i1 @llvm.vector.reduce.or.i1.nxv2i1( %vec) + +declare i1 @llvm.vector.reduce.xor.i1.nxv16i1( %vec) +declare i1 @llvm.vector.reduce.xor.i1.nxv8i1( %vec) +declare i1 @llvm.vector.reduce.xor.i1.nxv4i1( %vec) +declare i1 @llvm.vector.reduce.xor.i1.nxv2i1( %vec) + +declare i1 @llvm.vector.reduce.smin.i1.nxv16i1( %vec) +declare i1 @llvm.vector.reduce.smin.i1.nxv8i1( %vec) +declare i1 @llvm.vector.reduce.smin.i1.nxv4i1( %vec) +declare i1 @llvm.vector.reduce.smin.i1.nxv2i1( %vec) + +declare i1 @llvm.vector.reduce.smax.i1.nxv16i1( %vec) +declare i1 @llvm.vector.reduce.smax.i1.nxv8i1( %vec) +declare i1 @llvm.vector.reduce.smax.i1.nxv4i1( %vec) +declare i1 @llvm.vector.reduce.smax.i1.nxv2i1( %vec) + +declare i1 @llvm.vector.reduce.umin.i1.nxv16i1( %vec) +declare i1 @llvm.vector.reduce.umin.i1.nxv8i1( %vec) +declare i1 @llvm.vector.reduce.umin.i1.nxv4i1( %vec) +declare i1 @llvm.vector.reduce.umin.i1.nxv2i1( %vec) + +declare i1 @llvm.vector.reduce.umax.i1.nxv16i1( %vec) +declare i1 @llvm.vector.reduce.umax.i1.nxv8i1( %vec) +declare i1 @llvm.vector.reduce.umax.i1.nxv4i1( %vec) +declare i1 @llvm.vector.reduce.umax.i1.nxv2i1( %vec) Index: llvm/test/CodeGen/AArch64/sve-int-reduce.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-int-reduce.ll @@ -0,0 +1,417 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | 
FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; ANDV + +define i8 @andv_nxv16i8( %a) { +; CHECK-LABEL: andv_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: andv b0, p0, z0.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.and.nxv16i8( %a) + ret i8 %res +} + +define i16 @andv_nxv8i16( %a) { +; CHECK-LABEL: andv_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: andv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.and.nxv8i16( %a) + ret i16 %res +} + +define i32 @andv_nxv4i32( %a) { +; CHECK-LABEL: andv_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: andv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.and.nxv4i32( %a) + ret i32 %res +} + +define i64 @andv_nxv2i64( %a) { +; CHECK-LABEL: andv_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: andv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.and.nxv2i64( %a) + ret i64 %res +} + +; ORV + +define i8 @orv_nxv16i8( %a) { +; CHECK-LABEL: orv_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: orv b0, p0, z0.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.or.nxv16i8( %a) + ret i8 %res +} + +define i16 @orv_nxv8i16( %a) { +; CHECK-LABEL: orv_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: orv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.or.nxv8i16( %a) + ret i16 %res +} + +define i32 @orv_nxv4i32( %a) { +; CHECK-LABEL: orv_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: orv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.or.nxv4i32( %a) + ret i32 %res +} + +define i64 @orv_nxv2i64( %a) { +; CHECK-LABEL: orv_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: orv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.or.nxv2i64( %a) + ret i64 %res +} + +; XORV + +define i8 @xorv_nxv16i8( %a) { +; CHECK-LABEL: xorv_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: eorv b0, p0, z0.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.xor.nxv16i8( %a) + ret i8 %res +} + +define i16 @xorv_nxv8i16( %a) { +; CHECK-LABEL: xorv_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: eorv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.xor.nxv8i16( %a) + ret i16 %res +} + +define i32 @xorv_nxv4i32( %a) { +; CHECK-LABEL: xorv_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: eorv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.xor.nxv4i32( %a) + ret i32 %res +} + +define i64 @xorv_nxv2i64( %a) { +; CHECK-LABEL: xorv_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: eorv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.xor.nxv2i64( %a) + ret i64 %res +} + +; UADDV + +define i8 @uaddv_nxv16i8( %a) { +; CHECK-LABEL: uaddv_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: uaddv d0, p0, z0.b +; CHECK-NEXT: fmov x0, d0 +; 
CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.add.nxv16i8( %a) + ret i8 %res +} + +define i16 @uaddv_nxv8i16( %a) { +; CHECK-LABEL: uaddv_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: uaddv d0, p0, z0.h +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.add.nxv8i16( %a) + ret i16 %res +} + +define i32 @uaddv_nxv4i32( %a) { +; CHECK-LABEL: uaddv_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.add.nxv4i32( %a) + ret i32 %res +} + +define i64 @uaddv_nxv2i64( %a) { +; CHECK-LABEL: uaddv_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uaddv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.add.nxv2i64( %a) + ret i64 %res +} + +; UMINV + +define i8 @umin_nxv16i8( %a) { +; CHECK-LABEL: umin_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: uminv b0, p0, z0.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.umin.nxv16i8( %a) + ret i8 %res +} + +define i16 @umin_nxv8i16( %a) { +; CHECK-LABEL: umin_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: uminv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.umin.nxv8i16( %a) + ret i16 %res +} + +define i32 @umin_nxv4i32( %a) { +; CHECK-LABEL: umin_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uminv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.umin.nxv4i32( %a) + ret i32 %res +} + +define i64 @umin_nxv2i64( %a) { +; CHECK-LABEL: umin_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uminv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.umin.nxv2i64( %a) + ret i64 %res +} + +; SMINV + +define i8 @smin_nxv16i8( %a) { +; CHECK-LABEL: smin_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: sminv b0, p0, z0.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.smin.nxv16i8( %a) + ret i8 %res +} + +define i16 @smin_nxv8i16( %a) { +; CHECK-LABEL: smin_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: sminv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.smin.nxv8i16( %a) + ret i16 %res +} + +define i32 @smin_nxv4i32( %a) { +; CHECK-LABEL: smin_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sminv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.smin.nxv4i32( %a) + ret i32 %res +} + +define i64 @smin_nxv2i64( %a) { +; CHECK-LABEL: smin_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sminv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.smin.nxv2i64( %a) + ret i64 %res +} + +; UMAXV + +define i8 @umax_nxv16i8( %a) { +; CHECK-LABEL: umax_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: umaxv b0, p0, z0.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.umax.nxv16i8( %a) + ret i8 %res +} + +define i16 @umax_nxv8i16( %a) { +; CHECK-LABEL: umax_nxv8i16: +; CHECK: // %bb.0: +; 
CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: umaxv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.umax.nxv8i16( %a) + ret i16 %res +} + +define i32 @umax_nxv4i32( %a) { +; CHECK-LABEL: umax_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: umaxv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.umax.nxv4i32( %a) + ret i32 %res +} + +define i64 @umax_nxv2i64( %a) { +; CHECK-LABEL: umax_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: umaxv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.umax.nxv2i64( %a) + ret i64 %res +} + +; SMAXV + +define i8 @smax_nxv16i8( %a) { +; CHECK-LABEL: smax_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: smaxv b0, p0, z0.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.smax.nxv16i8( %a) + ret i8 %res +} + +define i16 @smax_nxv8i16( %a) { +; CHECK-LABEL: smax_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: smaxv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.smax.nxv8i16( %a) + ret i16 %res +} + +define i32 @smax_nxv4i32( %a) { +; CHECK-LABEL: smax_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: smaxv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.smax.nxv4i32( %a) + ret i32 %res +} + +define i64 @smax_nxv2i64( %a) { +; CHECK-LABEL: smax_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: smaxv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.smax.nxv2i64( %a) + ret i64 %res +} + +declare i8 @llvm.vector.reduce.and.nxv16i8() +declare i16 @llvm.vector.reduce.and.nxv8i16() +declare i32 @llvm.vector.reduce.and.nxv4i32() +declare i64 @llvm.vector.reduce.and.nxv2i64() + +declare i8 @llvm.vector.reduce.or.nxv16i8() +declare i16 @llvm.vector.reduce.or.nxv8i16() +declare i32 @llvm.vector.reduce.or.nxv4i32() +declare i64 @llvm.vector.reduce.or.nxv2i64() + +declare i8 @llvm.vector.reduce.xor.nxv16i8() +declare i16 @llvm.vector.reduce.xor.nxv8i16() +declare i32 @llvm.vector.reduce.xor.nxv4i32() +declare i64 @llvm.vector.reduce.xor.nxv2i64() + +declare i8 @llvm.vector.reduce.add.nxv16i8() +declare i16 @llvm.vector.reduce.add.nxv8i16() +declare i32 @llvm.vector.reduce.add.nxv4i32() +declare i64 @llvm.vector.reduce.add.nxv2i64() + +declare i8 @llvm.vector.reduce.umin.nxv16i8() +declare i16 @llvm.vector.reduce.umin.nxv8i16() +declare i32 @llvm.vector.reduce.umin.nxv4i32() +declare i64 @llvm.vector.reduce.umin.nxv2i64() + +declare i8 @llvm.vector.reduce.smin.nxv16i8() +declare i16 @llvm.vector.reduce.smin.nxv8i16() +declare i32 @llvm.vector.reduce.smin.nxv4i32() +declare i64 @llvm.vector.reduce.smin.nxv2i64() + +declare i8 @llvm.vector.reduce.umax.nxv16i8() +declare i16 @llvm.vector.reduce.umax.nxv8i16() +declare i32 @llvm.vector.reduce.umax.nxv4i32() +declare i64 @llvm.vector.reduce.umax.nxv2i64() + +declare i8 @llvm.vector.reduce.smax.nxv16i8() +declare i16 @llvm.vector.reduce.smax.nxv8i16() +declare i32 @llvm.vector.reduce.smax.nxv4i32() +declare i64 @llvm.vector.reduce.smax.nxv2i64() Index: llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll @@ -0,0 +1,145 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; ANDV + +define i1 @andv_nxv32i1( %a) { +; CHECK-LABEL: andv_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: and p0.b, p2/z, p0.b, p1.b +; CHECK-NEXT: not p0.b, p2/z, p0.b +; CHECK-NEXT: ptest p2, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.and.nxv32i1( %a) + ret i1 %res +} + +define i1 @andv_nxv64i1( %a) { +; CHECK-LABEL: andv_nxv64i1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p4.b +; CHECK-NEXT: and p1.b, p4/z, p1.b, p3.b +; CHECK-NEXT: and p0.b, p4/z, p0.b, p2.b +; CHECK-NEXT: and p0.b, p4/z, p0.b, p1.b +; CHECK-NEXT: not p0.b, p4/z, p0.b +; CHECK-NEXT: ptest p4, p0.b +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.and.nxv64i1( %a) + ret i1 %res +} + +; ORV + +define i1 @orv_nxv32i1( %a) { +; CHECK-LABEL: orv_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: orr p0.b, p2/z, p0.b, p1.b +; CHECK-NEXT: ptest p2, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.or.nxv32i1( %a) + ret i1 %res +} + +; XORV + +define i1 @xorv_nxv32i1( %a) { +; CHECK-LABEL: xorv_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: eor p0.b, p2/z, p0.b, p1.b +; CHECK-NEXT: cntp x8, p2, p0.b +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.xor.nxv32i1( %a) + ret i1 %res +} + +; SMAXV + +define i1 @smaxv_nxv32i1( %a) { +; CHECK-LABEL: smaxv_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: and p0.b, p2/z, p0.b, p1.b +; CHECK-NEXT: not p0.b, p2/z, p0.b +; CHECK-NEXT: ptest p2, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smax.nxv32i1( %a) + ret i1 %res +} + +; SMINV + +define i1 @sminv_nxv32i1( %a) { +; CHECK-LABEL: sminv_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: orr p0.b, p2/z, p0.b, p1.b +; CHECK-NEXT: ptest p2, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smin.nxv32i1( %a) + ret i1 %res +} + +; UMAXV + +define i1 @umaxv_nxv32i1( %a) { +; CHECK-LABEL: umaxv_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: orr p0.b, p2/z, p0.b, p1.b +; CHECK-NEXT: ptest p2, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umax.nxv32i1( %a) + ret i1 %res +} + +; UMINV + +define i1 @uminv_nxv32i1( %a) { +; CHECK-LABEL: uminv_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: and p0.b, p2/z, p0.b, p1.b +; CHECK-NEXT: not p0.b, p2/z, p0.b +; CHECK-NEXT: ptest p2, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umin.nxv32i1( %a) + ret i1 %res +} + +declare i1 @llvm.vector.reduce.and.nxv32i1() +declare i1 
@llvm.vector.reduce.and.nxv64i1() + +declare i1 @llvm.vector.reduce.or.nxv32i1() + +declare i1 @llvm.vector.reduce.xor.nxv32i1() + +declare i1 @llvm.vector.reduce.smax.nxv32i1() + +declare i1 @llvm.vector.reduce.smin.nxv32i1() + +declare i1 @llvm.vector.reduce.umax.nxv32i1() + +declare i1 @llvm.vector.reduce.umin.nxv32i1() Index: llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll @@ -0,0 +1,233 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; ANDV + +define i8 @andv_nxv8i8( %a) { +; CHECK-LABEL: andv_nxv8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: andv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.and.nxv8i8( %a) + ret i8 %res +} + +define i32 @andv_nxv8i32( %a) { +; CHECK-LABEL: andv_nxv8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: andv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.and.nxv8i32( %a) + ret i32 %res +} + +; ORV + +define i32 @orv_nxv2i32( %a) { +; CHECK-LABEL: orv_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: orv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.or.nxv2i32( %a) + ret i32 %res +} + +define i64 @orv_nxv8i64( %a) { +; CHECK-LABEL: orv_nxv8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: orv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.or.nxv8i64( %a) + ret i64 %res +} + +; XORV + +define i16 @xorv_nxv2i16( %a) { +; CHECK-LABEL: xorv_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: eorv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.xor.nxv2i16( %a) + ret i16 %res +} + +define i32 @xorv_nxv8i32( %a) { +; CHECK-LABEL: xorv_nxv8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: eorv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.xor.nxv8i32( %a) + ret i32 %res +} + +; UADDV + +define i16 @uaddv_nxv4i16( %a) { +; CHECK-LABEL: uaddv_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.add.nxv4i16( %a) + ret i16 %res +} + +define i16 @uaddv_nxv16i16( %a) { +; CHECK-LABEL: uaddv_nxv16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: uaddv d0, p0, z0.h +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.add.nxv16i16( %a) + ret i16 %res +} + +define i32 @uaddv_nxv16i32( %a) { +; CHECK-LABEL: uaddv_nxv16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add 
z1.s, z1.s, z3.s +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.add.nxv16i32( %a) + ret i32 %res +} + +; UMINV + +define i32 @umin_nxv2i32( %a) { +; CHECK-LABEL: umin_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: uminv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.umin.nxv2i32( %a) + ret i32 %res +} + +define i64 @umin_nxv4i64( %a) { +; CHECK-LABEL: umin_nxv4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: uminv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.umin.nxv4i64( %a) + ret i64 %res +} + +; SMINV + +define i8 @smin_nxv4i8( %a) { +; CHECK-LABEL: smin_nxv4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sxtb z0.s, p0/m, z0.s +; CHECK-NEXT: sminv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.smin.nxv4i8( %a) + ret i8 %res +} + +define i32 @smin_nxv8i32( %a) { +; CHECK-LABEL: smin_nxv8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: sminv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.smin.nxv8i32( %a) + ret i32 %res +} + +; UMAXV + +define i16 @smin_nxv16i16( %a) { +; CHECK-LABEL: smin_nxv16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: umaxv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.umax.nxv16i16( %a) + ret i16 %res +} + +; SMAXV + +define i64 @smin_nxv8i64( %a) { +; CHECK-LABEL: smin_nxv8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: smax z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: smax z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: smaxv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.smax.nxv8i64( %a) + ret i64 %res +} + +declare i8 @llvm.vector.reduce.and.nxv8i8() +declare i32 @llvm.vector.reduce.and.nxv8i32() + +declare i32 @llvm.vector.reduce.or.nxv2i32() +declare i64 @llvm.vector.reduce.or.nxv8i64() + +declare i16 @llvm.vector.reduce.xor.nxv2i16() +declare i32 @llvm.vector.reduce.xor.nxv8i32() + +declare i16 @llvm.vector.reduce.add.nxv4i16() +declare i16 @llvm.vector.reduce.add.nxv16i16() +declare i32 @llvm.vector.reduce.add.nxv16i32() + +declare i32 @llvm.vector.reduce.umin.nxv2i32() +declare i64 @llvm.vector.reduce.umin.nxv4i64() + +declare i8 @llvm.vector.reduce.smin.nxv4i8() +declare i32 @llvm.vector.reduce.smin.nxv8i32() + +declare i16 @llvm.vector.reduce.umax.nxv16i16() + +declare i64 @llvm.vector.reduce.smax.nxv8i64()