diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20857,7 +20857,7 @@
   unsigned Opcode = N->getOpcode();

   // VECREDUCE over 1-element vector is just an extract.
-  if (VT.getVectorNumElements() == 1) {
+  if (VT.getVectorElementCount().isScalar()) {
     SDLoc dl(N);
     SDValue Res =
         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3323,6 +3323,9 @@
     SDValue InVec = Op.getOperand(0);
     SDValue EltNo = Op.getOperand(1);
     EVT VecVT = InVec.getValueType();
+    // computeKnownBits not yet implemented for scalable vectors.
+    if (VecVT.isScalableVector())
+      break;
     const unsigned EltBitWidth = VecVT.getScalarSizeInBits();
     const unsigned NumSrcElts = VecVT.getVectorNumElements();
@@ -4809,6 +4812,16 @@
   case ISD::VSCALE:
     assert(VT == Operand.getValueType() && "Unexpected VT!");
     break;
+  case ISD::VECREDUCE_SMIN:
+  case ISD::VECREDUCE_UMAX:
+    if (Operand.getValueType().getScalarType() == MVT::i1)
+      return getNode(ISD::VECREDUCE_OR, DL, VT, Operand);
+    break;
+  case ISD::VECREDUCE_SMAX:
+  case ISD::VECREDUCE_UMIN:
+    if (Operand.getValueType().getScalarType() == MVT::i1)
+      return getNode(ISD::VECREDUCE_AND, DL, VT, Operand);
+    break;
   }

   SDNode *N;
@@ -5318,10 +5331,6 @@
   case ISD::MULHS:
   case ISD::SDIV:
   case ISD::SREM:
-  case ISD::SMIN:
-  case ISD::SMAX:
-  case ISD::UMIN:
-  case ISD::UMAX:
   case ISD::SADDSAT:
   case ISD::SSUBSAT:
   case ISD::UADDSAT:
@@ -5330,6 +5339,22 @@
     assert(N1.getValueType() == N2.getValueType() &&
            N1.getValueType() == VT && "Binary operator types must match!");
     break;
+  case ISD::SMIN:
+  case ISD::UMAX:
+    assert(VT.isInteger() && "This operator does not apply to FP types!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           N1.getValueType() == VT && "Binary operator types must match!");
+    if (VT.isVector() && VT.getVectorElementType() == MVT::i1)
+      return getNode(ISD::OR, DL, VT, N1, N2);
+    break;
+  case ISD::SMAX:
+  case ISD::UMIN:
+    assert(VT.isInteger() && "This operator does not apply to FP types!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           N1.getValueType() == VT && "Binary operator types must match!");
+    if (VT.isVector() && VT.getVectorElementType() == MVT::i1)
+      return getNode(ISD::AND, DL, VT, N1, N2);
+    break;
   case ISD::FADD:
   case ISD::FSUB:
   case ISD::FMUL:
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8000,6 +8000,10 @@
   SDValue Op = Node->getOperand(0);
   EVT VT = Op.getValueType();

+  if (VT.isScalableVector())
+    report_fatal_error(
+        "Expanding reductions for scalable vectors is undefined.");
+
   // Try to use a shuffle reduction for power of two vectors.
   if (VT.isPow2VectorType()) {
     while (VT.getVectorNumElements() > 1) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -933,8 +933,9 @@
                                               SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp, SelectionDAG &DAG) const;
-  SDValue LowerFixedLengthReductionToSVE(unsigned Opcode, SDValue ScalarOp,
-                                         SelectionDAG &DAG) const;
+  SDValue LowerPredReductionToSVE(SDValue ScalarOp, SelectionDAG &DAG) const;
+  SDValue LowerReductionToSVE(unsigned Opcode, SDValue ScalarOp,
+                              SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorSelectToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorSetccToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1013,6 +1013,14 @@
       setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
     }

     // Illegal unpacked integer vector types.
@@ -1027,6 +1035,9 @@
       setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);

      // There are no legal MVT::nxv16f## based types.
      if (VT != MVT::nxv16i1) {
@@ -9815,30 +9826,35 @@
                       Op.getOpcode() == ISD::VECREDUCE_FADD ||
                       (Op.getOpcode() != ISD::VECREDUCE_ADD &&
                        SrcVT.getVectorElementType() == MVT::i64);
-  if (useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
+  if (SrcVT.isScalableVector() ||
+      useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
+
+    if (SrcVT.getVectorElementType() == MVT::i1)
+      return LowerPredReductionToSVE(Op, DAG);
+
     switch (Op.getOpcode()) {
     case ISD::VECREDUCE_ADD:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
     case ISD::VECREDUCE_AND:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
     case ISD::VECREDUCE_OR:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
     case ISD::VECREDUCE_SMAX:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
     case ISD::VECREDUCE_SMIN:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
     case ISD::VECREDUCE_UMAX:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
     case ISD::VECREDUCE_UMIN:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
     case ISD::VECREDUCE_XOR:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
     case ISD::VECREDUCE_FADD:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
     case ISD::VECREDUCE_FMAX:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
     case ISD::VECREDUCE_FMIN:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
     default:
       llvm_unreachable("Unhandled fixed length reduction");
     }
@@ -16333,20 +16349,56 @@
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
 }

-SDValue AArch64TargetLowering::LowerFixedLengthReductionToSVE(unsigned Opcode,
-    SDValue ScalarOp, SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
+                                                       SelectionDAG &DAG) const {
+  SDLoc DL(ReduceOp);
+  SDValue Op = ReduceOp.getOperand(0);
+  EVT OpVT = Op.getValueType();
+  EVT VT = ReduceOp.getValueType();
+
+  if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
+
+  switch (ReduceOp.getOpcode()) {
+  default:
+    return SDValue();
+  case ISD::VECREDUCE_OR:
+    return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
+  case ISD::VECREDUCE_AND: {
+    Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
+    return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
+  }
+  case ISD::VECREDUCE_XOR: {
+    SDValue ID =
+        DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
+    SDValue Cntp =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
+    return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
+  }
+  }
+
+  return SDValue();
+}
+
+SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
+                                                   SDValue
ScalarOp, + SelectionDAG &DAG) const { SDLoc DL(ScalarOp); SDValue VecOp = ScalarOp.getOperand(0); EVT SrcVT = VecOp.getValueType(); - SDValue Pg = getPredicateForVector(DAG, DL, SrcVT); - EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT); - VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); + if (useSVEForFixedLengthVectorVT(SrcVT, true)) { + EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT); + VecOp = convertToScalableVector(DAG, ContainerVT, VecOp); + } // UADDV always returns an i64 result. EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 : SrcVT.getVectorElementType(); + SDValue Pg = getPredicateForVector(DAG, DL, SrcVT); SDValue Rdx = DAG.getNode(Opcode, DL, getPackedSVEVectorVT(ResVT), Pg, VecOp); SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, DAG.getConstant(0, DL, MVT::i64)); diff --git a/llvm/test/CodeGen/AArch64/sve-int-pred-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-pred-reduce.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-int-pred-reduce.ll @@ -0,0 +1,375 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; ANDV + +define i1 @reduce_and_nxv16i1( %vec) { +; CHECK-LABEL: reduce_and_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.and.i1.nxv16i1( %vec) + ret i1 %res +} + +define i1 @reduce_and_nxv8i1( %vec) { +; CHECK-LABEL: reduce_and_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.and.i1.nxv8i1( %vec) + ret i1 %res +} + +define i1 @reduce_and_nxv4i1( %vec) { +; CHECK-LABEL: reduce_and_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.and.i1.nxv4i1( %vec) + ret i1 %res +} + +define i1 @reduce_and_nxv2i1( %vec) { +; CHECK-LABEL: reduce_and_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.and.i1.nxv2i1( %vec) + ret i1 %res +} + +; ORV + +define i1 @reduce_or_nxv16i1( %vec) { +; CHECK-LABEL: reduce_or_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.or.i1.nxv16i1( %vec) + ret i1 %res +} + +define i1 @reduce_or_nxv8i1( %vec) { +; CHECK-LABEL: reduce_or_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.or.i1.nxv8i1( %vec) + ret i1 %res +} + +define i1 @reduce_or_nxv4i1( %vec) { +; CHECK-LABEL: reduce_or_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.or.i1.nxv4i1( %vec) + ret i1 %res +} + +define i1 @reduce_or_nxv2i1( %vec) { +; CHECK-LABEL: 
reduce_or_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.or.i1.nxv2i1( %vec) + ret i1 %res +} + +; XORV + +define i1 @reduce_xor_nxv16i1( %vec) { +; CHECK-LABEL: reduce_xor_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: cntp x8, p1, p0.b +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.xor.i1.nxv16i1( %vec) + ret i1 %res +} + +define i1 @reduce_xor_nxv8i1( %vec) { +; CHECK-LABEL: reduce_xor_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: cntp x8, p1, p0.h +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.xor.i1.nxv8i1( %vec) + ret i1 %res +} + +define i1 @reduce_xor_nxv4i1( %vec) { +; CHECK-LABEL: reduce_xor_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: cntp x8, p1, p0.s +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.xor.i1.nxv4i1( %vec) + ret i1 %res +} + +define i1 @reduce_xor_nxv2i1( %vec) { +; CHECK-LABEL: reduce_xor_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: cntp x8, p1, p0.d +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.xor.i1.nxv2i1( %vec) + ret i1 %res +} + +; SMAXV + +define i1 @reduce_smax_nxv16i1( %vec) { +; CHECK-LABEL: reduce_smax_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smax.i1.nxv16i1( %vec) + ret i1 %res +} + +define i1 @reduce_smax_nxv8i1( %vec) { +; CHECK-LABEL: reduce_smax_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smax.i1.nxv8i1( %vec) + ret i1 %res +} + +define i1 @reduce_smax_nxv4i1( %vec) { +; CHECK-LABEL: reduce_smax_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smax.i1.nxv4i1( %vec) + ret i1 %res +} + +define i1 @reduce_smax_nxv2i1( %vec) { +; CHECK-LABEL: reduce_smax_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smax.i1.nxv2i1( %vec) + ret i1 %res +} + +; SMINV + +define i1 @reduce_smin_nxv16i1( %vec) { +; CHECK-LABEL: reduce_smin_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smin.i1.nxv16i1( %vec) + ret i1 %res +} + +define i1 @reduce_smin_nxv8i1( %vec) { +; CHECK-LABEL: reduce_smin_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smin.i1.nxv8i1( %vec) + ret i1 %res +} + +define i1 @reduce_smin_nxv4i1( %vec) { +; CHECK-LABEL: reduce_smin_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smin.i1.nxv4i1( %vec) + ret i1 %res +} + +define i1 @reduce_smin_nxv2i1( %vec) { +; CHECK-LABEL: reduce_smin_nxv2i1: +; CHECK: // %bb.0: 
+; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smin.i1.nxv2i1( %vec) + ret i1 %res +} + +; UMAXV + +define i1 @reduce_umax_nxv16i1( %vec) { +; CHECK-LABEL: reduce_umax_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umax.i1.nxv16i1( %vec) + ret i1 %res +} + +define i1 @reduce_umax_nxv8i1( %vec) { +; CHECK-LABEL: reduce_umax_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umax.i1.nxv8i1( %vec) + ret i1 %res +} + +define i1 @reduce_umax_nxv4i1( %vec) { +; CHECK-LABEL: reduce_umax_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umax.i1.nxv4i1( %vec) + ret i1 %res +} + +define i1 @reduce_umax_nxv2i1( %vec) { +; CHECK-LABEL: reduce_umax_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umax.i1.nxv2i1( %vec) + ret i1 %res +} + +; UMINV + +define i1 @reduce_umin_nxv16i1( %vec) { +; CHECK-LABEL: reduce_umin_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umin.i1.nxv16i1( %vec) + ret i1 %res +} + +define i1 @reduce_umin_nxv8i1( %vec) { +; CHECK-LABEL: reduce_umin_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umin.i1.nxv8i1( %vec) + ret i1 %res +} + +define i1 @reduce_umin_nxv4i1( %vec) { +; CHECK-LABEL: reduce_umin_nxv4i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umin.i1.nxv4i1( %vec) + ret i1 %res +} + +define i1 @reduce_umin_nxv2i1( %vec) { +; CHECK-LABEL: reduce_umin_nxv2i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: not p0.b, p1/z, p0.b +; CHECK-NEXT: ptest p1, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umin.i1.nxv2i1( %vec) + ret i1 %res +} + +declare i1 @llvm.vector.reduce.and.i1.nxv16i1( %vec) +declare i1 @llvm.vector.reduce.and.i1.nxv8i1( %vec) +declare i1 @llvm.vector.reduce.and.i1.nxv4i1( %vec) +declare i1 @llvm.vector.reduce.and.i1.nxv2i1( %vec) + +declare i1 @llvm.vector.reduce.or.i1.nxv16i1( %vec) +declare i1 @llvm.vector.reduce.or.i1.nxv8i1( %vec) +declare i1 @llvm.vector.reduce.or.i1.nxv4i1( %vec) +declare i1 @llvm.vector.reduce.or.i1.nxv2i1( %vec) + +declare i1 @llvm.vector.reduce.xor.i1.nxv16i1( %vec) +declare i1 @llvm.vector.reduce.xor.i1.nxv8i1( %vec) +declare i1 @llvm.vector.reduce.xor.i1.nxv4i1( %vec) +declare i1 @llvm.vector.reduce.xor.i1.nxv2i1( %vec) + +declare i1 @llvm.vector.reduce.smin.i1.nxv16i1( %vec) +declare i1 @llvm.vector.reduce.smin.i1.nxv8i1( %vec) +declare i1 @llvm.vector.reduce.smin.i1.nxv4i1( %vec) +declare i1 @llvm.vector.reduce.smin.i1.nxv2i1( %vec) + +declare i1 @llvm.vector.reduce.smax.i1.nxv16i1( %vec) +declare i1 @llvm.vector.reduce.smax.i1.nxv8i1( %vec) +declare i1 
@llvm.vector.reduce.smax.i1.nxv4i1( %vec) +declare i1 @llvm.vector.reduce.smax.i1.nxv2i1( %vec) + +declare i1 @llvm.vector.reduce.umin.i1.nxv16i1( %vec) +declare i1 @llvm.vector.reduce.umin.i1.nxv8i1( %vec) +declare i1 @llvm.vector.reduce.umin.i1.nxv4i1( %vec) +declare i1 @llvm.vector.reduce.umin.i1.nxv2i1( %vec) + +declare i1 @llvm.vector.reduce.umax.i1.nxv16i1( %vec) +declare i1 @llvm.vector.reduce.umax.i1.nxv8i1( %vec) +declare i1 @llvm.vector.reduce.umax.i1.nxv4i1( %vec) +declare i1 @llvm.vector.reduce.umax.i1.nxv2i1( %vec) diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll @@ -0,0 +1,417 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; ANDV + +define i8 @andv_nxv16i8( %a) { +; CHECK-LABEL: andv_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: andv b0, p0, z0.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.and.nxv16i8( %a) + ret i8 %res +} + +define i16 @andv_nxv8i16( %a) { +; CHECK-LABEL: andv_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: andv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.and.nxv8i16( %a) + ret i16 %res +} + +define i32 @andv_nxv4i32( %a) { +; CHECK-LABEL: andv_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: andv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.and.nxv4i32( %a) + ret i32 %res +} + +define i64 @andv_nxv2i64( %a) { +; CHECK-LABEL: andv_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: andv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.and.nxv2i64( %a) + ret i64 %res +} + +; ORV + +define i8 @orv_nxv16i8( %a) { +; CHECK-LABEL: orv_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: orv b0, p0, z0.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.or.nxv16i8( %a) + ret i8 %res +} + +define i16 @orv_nxv8i16( %a) { +; CHECK-LABEL: orv_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: orv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.or.nxv8i16( %a) + ret i16 %res +} + +define i32 @orv_nxv4i32( %a) { +; CHECK-LABEL: orv_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: orv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.or.nxv4i32( %a) + ret i32 %res +} + +define i64 @orv_nxv2i64( %a) { +; CHECK-LABEL: orv_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: orv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.or.nxv2i64( %a) + ret i64 %res +} + +; XORV + +define i8 @xorv_nxv16i8( %a) { +; CHECK-LABEL: xorv_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: eorv b0, p0, z0.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.xor.nxv16i8( %a) + ret i8 %res +} + +define i16 @xorv_nxv8i16( %a) { +; CHECK-LABEL: xorv_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: 
ptrue p0.h +; CHECK-NEXT: eorv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.xor.nxv8i16( %a) + ret i16 %res +} + +define i32 @xorv_nxv4i32( %a) { +; CHECK-LABEL: xorv_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: eorv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.xor.nxv4i32( %a) + ret i32 %res +} + +define i64 @xorv_nxv2i64( %a) { +; CHECK-LABEL: xorv_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: eorv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.xor.nxv2i64( %a) + ret i64 %res +} + +; UADDV + +define i8 @uaddv_nxv16i8( %a) { +; CHECK-LABEL: uaddv_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: uaddv d0, p0, z0.b +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.add.nxv16i8( %a) + ret i8 %res +} + +define i16 @uaddv_nxv8i16( %a) { +; CHECK-LABEL: uaddv_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: uaddv d0, p0, z0.h +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.add.nxv8i16( %a) + ret i16 %res +} + +define i32 @uaddv_nxv4i32( %a) { +; CHECK-LABEL: uaddv_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.add.nxv4i32( %a) + ret i32 %res +} + +define i64 @uaddv_nxv2i64( %a) { +; CHECK-LABEL: uaddv_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uaddv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.add.nxv2i64( %a) + ret i64 %res +} + +; UMINV + +define i8 @umin_nxv16i8( %a) { +; CHECK-LABEL: umin_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: uminv b0, p0, z0.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.umin.nxv16i8( %a) + ret i8 %res +} + +define i16 @umin_nxv8i16( %a) { +; CHECK-LABEL: umin_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: uminv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.umin.nxv8i16( %a) + ret i16 %res +} + +define i32 @umin_nxv4i32( %a) { +; CHECK-LABEL: umin_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uminv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.umin.nxv4i32( %a) + ret i32 %res +} + +define i64 @umin_nxv2i64( %a) { +; CHECK-LABEL: umin_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uminv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.umin.nxv2i64( %a) + ret i64 %res +} + +; SMINV + +define i8 @smin_nxv16i8( %a) { +; CHECK-LABEL: smin_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: sminv b0, p0, z0.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.smin.nxv16i8( %a) + ret i8 %res +} + +define i16 @smin_nxv8i16( %a) { +; CHECK-LABEL: smin_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: sminv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.smin.nxv8i16( %a) + ret i16 %res +} + +define i32 
@smin_nxv4i32( %a) { +; CHECK-LABEL: smin_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sminv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.smin.nxv4i32( %a) + ret i32 %res +} + +define i64 @smin_nxv2i64( %a) { +; CHECK-LABEL: smin_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sminv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.smin.nxv2i64( %a) + ret i64 %res +} + +; UMAXV + +define i8 @umax_nxv16i8( %a) { +; CHECK-LABEL: umax_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: umaxv b0, p0, z0.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.umax.nxv16i8( %a) + ret i8 %res +} + +define i16 @umax_nxv8i16( %a) { +; CHECK-LABEL: umax_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: umaxv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.umax.nxv8i16( %a) + ret i16 %res +} + +define i32 @umax_nxv4i32( %a) { +; CHECK-LABEL: umax_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: umaxv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.umax.nxv4i32( %a) + ret i32 %res +} + +define i64 @umax_nxv2i64( %a) { +; CHECK-LABEL: umax_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: umaxv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.umax.nxv2i64( %a) + ret i64 %res +} + +; SMAXV + +define i8 @smax_nxv16i8( %a) { +; CHECK-LABEL: smax_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: smaxv b0, p0, z0.b +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.smax.nxv16i8( %a) + ret i8 %res +} + +define i16 @smax_nxv8i16( %a) { +; CHECK-LABEL: smax_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: smaxv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.smax.nxv8i16( %a) + ret i16 %res +} + +define i32 @smax_nxv4i32( %a) { +; CHECK-LABEL: smax_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: smaxv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.smax.nxv4i32( %a) + ret i32 %res +} + +define i64 @smax_nxv2i64( %a) { +; CHECK-LABEL: smax_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: smaxv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.smax.nxv2i64( %a) + ret i64 %res +} + +declare i8 @llvm.vector.reduce.and.nxv16i8() +declare i16 @llvm.vector.reduce.and.nxv8i16() +declare i32 @llvm.vector.reduce.and.nxv4i32() +declare i64 @llvm.vector.reduce.and.nxv2i64() + +declare i8 @llvm.vector.reduce.or.nxv16i8() +declare i16 @llvm.vector.reduce.or.nxv8i16() +declare i32 @llvm.vector.reduce.or.nxv4i32() +declare i64 @llvm.vector.reduce.or.nxv2i64() + +declare i8 @llvm.vector.reduce.xor.nxv16i8() +declare i16 @llvm.vector.reduce.xor.nxv8i16() +declare i32 @llvm.vector.reduce.xor.nxv4i32() +declare i64 @llvm.vector.reduce.xor.nxv2i64() + +declare i8 @llvm.vector.reduce.add.nxv16i8() +declare i16 @llvm.vector.reduce.add.nxv8i16() +declare i32 @llvm.vector.reduce.add.nxv4i32() +declare i64 @llvm.vector.reduce.add.nxv2i64() + +declare i8 @llvm.vector.reduce.umin.nxv16i8() +declare i16 @llvm.vector.reduce.umin.nxv8i16() +declare i32 @llvm.vector.reduce.umin.nxv4i32() 
+declare i64 @llvm.vector.reduce.umin.nxv2i64() + +declare i8 @llvm.vector.reduce.smin.nxv16i8() +declare i16 @llvm.vector.reduce.smin.nxv8i16() +declare i32 @llvm.vector.reduce.smin.nxv4i32() +declare i64 @llvm.vector.reduce.smin.nxv2i64() + +declare i8 @llvm.vector.reduce.umax.nxv16i8() +declare i16 @llvm.vector.reduce.umax.nxv8i16() +declare i32 @llvm.vector.reduce.umax.nxv4i32() +declare i64 @llvm.vector.reduce.umax.nxv2i64() + +declare i8 @llvm.vector.reduce.smax.nxv16i8() +declare i16 @llvm.vector.reduce.smax.nxv8i16() +declare i32 @llvm.vector.reduce.smax.nxv4i32() +declare i64 @llvm.vector.reduce.smax.nxv2i64() diff --git a/llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll @@ -0,0 +1,145 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; ANDV + +define i1 @andv_nxv32i1( %a) { +; CHECK-LABEL: andv_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: and p0.b, p2/z, p0.b, p1.b +; CHECK-NEXT: not p0.b, p2/z, p0.b +; CHECK-NEXT: ptest p2, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.and.nxv32i1( %a) + ret i1 %res +} + +define i1 @andv_nxv64i1( %a) { +; CHECK-LABEL: andv_nxv64i1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ptrue p4.b +; CHECK-NEXT: and p1.b, p4/z, p1.b, p3.b +; CHECK-NEXT: and p0.b, p4/z, p0.b, p2.b +; CHECK-NEXT: and p0.b, p4/z, p0.b, p1.b +; CHECK-NEXT: not p0.b, p4/z, p0.b +; CHECK-NEXT: ptest p4, p0.b +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.and.nxv64i1( %a) + ret i1 %res +} + +; ORV + +define i1 @orv_nxv32i1( %a) { +; CHECK-LABEL: orv_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: orr p0.b, p2/z, p0.b, p1.b +; CHECK-NEXT: ptest p2, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.or.nxv32i1( %a) + ret i1 %res +} + +; XORV + +define i1 @xorv_nxv32i1( %a) { +; CHECK-LABEL: xorv_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: eor p0.b, p2/z, p0.b, p1.b +; CHECK-NEXT: cntp x8, p2, p0.b +; CHECK-NEXT: and w0, w8, #0x1 +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.xor.nxv32i1( %a) + ret i1 %res +} + +; SMAXV + +define i1 @smaxv_nxv32i1( %a) { +; CHECK-LABEL: smaxv_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: and p0.b, p2/z, p0.b, p1.b +; CHECK-NEXT: not p0.b, p2/z, p0.b +; CHECK-NEXT: ptest p2, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smax.nxv32i1( %a) + ret i1 %res +} + +; SMINV + +define i1 @sminv_nxv32i1( %a) { +; CHECK-LABEL: sminv_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: 
orr p0.b, p2/z, p0.b, p1.b +; CHECK-NEXT: ptest p2, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.smin.nxv32i1( %a) + ret i1 %res +} + +; UMAXV + +define i1 @umaxv_nxv32i1( %a) { +; CHECK-LABEL: umaxv_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: orr p0.b, p2/z, p0.b, p1.b +; CHECK-NEXT: ptest p2, p0.b +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umax.nxv32i1( %a) + ret i1 %res +} + +; UMINV + +define i1 @uminv_nxv32i1( %a) { +; CHECK-LABEL: uminv_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: and p0.b, p2/z, p0.b, p1.b +; CHECK-NEXT: not p0.b, p2/z, p0.b +; CHECK-NEXT: ptest p2, p0.b +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret + %res = call i1 @llvm.vector.reduce.umin.nxv32i1( %a) + ret i1 %res +} + +declare i1 @llvm.vector.reduce.and.nxv32i1() +declare i1 @llvm.vector.reduce.and.nxv64i1() + +declare i1 @llvm.vector.reduce.or.nxv32i1() + +declare i1 @llvm.vector.reduce.xor.nxv32i1() + +declare i1 @llvm.vector.reduce.smax.nxv32i1() + +declare i1 @llvm.vector.reduce.smin.nxv32i1() + +declare i1 @llvm.vector.reduce.umax.nxv32i1() + +declare i1 @llvm.vector.reduce.umin.nxv32i1() diff --git a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll @@ -0,0 +1,233 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +; ANDV + +define i8 @andv_nxv8i8( %a) { +; CHECK-LABEL: andv_nxv8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: andv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.and.nxv8i8( %a) + ret i8 %res +} + +define i32 @andv_nxv8i32( %a) { +; CHECK-LABEL: andv_nxv8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: andv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.and.nxv8i32( %a) + ret i32 %res +} + +; ORV + +define i32 @orv_nxv2i32( %a) { +; CHECK-LABEL: orv_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: orv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.or.nxv2i32( %a) + ret i32 %res +} + +define i64 @orv_nxv8i64( %a) { +; CHECK-LABEL: orv_nxv8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: orr z1.d, z1.d, z3.d +; CHECK-NEXT: orr z0.d, z0.d, z2.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: orv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.or.nxv8i64( %a) + ret i64 %res +} + +; XORV + +define i16 @xorv_nxv2i16( %a) { +; CHECK-LABEL: xorv_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: eorv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.xor.nxv2i16( %a) + ret i16 %res +} + +define i32 @xorv_nxv8i32( %a) { +; CHECK-LABEL: xorv_nxv8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: eorv s0, p0, z0.s +; 
CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.xor.nxv8i32( %a) + ret i32 %res +} + +; UADDV + +define i16 @uaddv_nxv4i16( %a) { +; CHECK-LABEL: uaddv_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.add.nxv4i16( %a) + ret i16 %res +} + +define i16 @uaddv_nxv16i16( %a) { +; CHECK-LABEL: uaddv_nxv16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: uaddv d0, p0, z0.h +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.add.nxv16i16( %a) + ret i16 %res +} + +define i32 @uaddv_nxv16i32( %a) { +; CHECK-LABEL: uaddv_nxv16i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add z1.s, z1.s, z3.s +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: uaddv d0, p0, z0.s +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.add.nxv16i32( %a) + ret i32 %res +} + +; UMINV + +define i32 @umin_nxv2i32( %a) { +; CHECK-LABEL: umin_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: uminv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.umin.nxv2i32( %a) + ret i32 %res +} + +define i64 @umin_nxv4i64( %a) { +; CHECK-LABEL: umin_nxv4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: uminv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.umin.nxv4i64( %a) + ret i64 %res +} + +; SMINV + +define i8 @smin_nxv4i8( %a) { +; CHECK-LABEL: smin_nxv4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: sxtb z0.s, p0/m, z0.s +; CHECK-NEXT: sminv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i8 @llvm.vector.reduce.smin.nxv4i8( %a) + ret i8 %res +} + +define i32 @smin_nxv8i32( %a) { +; CHECK-LABEL: smin_nxv8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: sminv s0, p0, z0.s +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i32 @llvm.vector.reduce.smin.nxv8i32( %a) + ret i32 %res +} + +; UMAXV + +define i16 @smin_nxv16i16( %a) { +; CHECK-LABEL: smin_nxv16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: umaxv h0, p0, z0.h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %res = call i16 @llvm.vector.reduce.umax.nxv16i16( %a) + ret i16 %res +} + +; SMAXV + +define i64 @smin_nxv8i64( %a) { +; CHECK-LABEL: smin_nxv8i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: smax z1.d, p0/m, z1.d, z3.d +; CHECK-NEXT: smax z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: smaxv d0, p0, z0.d +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %res = call i64 @llvm.vector.reduce.smax.nxv8i64( %a) + ret i64 %res +} + +declare i8 @llvm.vector.reduce.and.nxv8i8() +declare i32 @llvm.vector.reduce.and.nxv8i32() + +declare i32 @llvm.vector.reduce.or.nxv2i32() +declare i64 @llvm.vector.reduce.or.nxv8i64() + +declare i16 @llvm.vector.reduce.xor.nxv2i16() +declare i32 @llvm.vector.reduce.xor.nxv8i32() + +declare 
i16 @llvm.vector.reduce.add.nxv4i16() +declare i16 @llvm.vector.reduce.add.nxv16i16() +declare i32 @llvm.vector.reduce.add.nxv16i32() + +declare i32 @llvm.vector.reduce.umin.nxv2i32() +declare i64 @llvm.vector.reduce.umin.nxv4i64() + +declare i8 @llvm.vector.reduce.smin.nxv4i8() +declare i32 @llvm.vector.reduce.smin.nxv8i32() + +declare i16 @llvm.vector.reduce.umax.nxv16i16() + +declare i64 @llvm.vector.reduce.smax.nxv8i64() diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll @@ -142,10 +142,14 @@ define i1 @test_v4i1(<4 x i1> %a) nounwind { ; CHECK-LABEL: test_v4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.4h, #1 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umaxv h0, v0.4h -; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w10, v0.h[1] +; CHECK-NEXT: umov w11, v0.h[0] +; CHECK-NEXT: umov w9, v0.h[2] +; CHECK-NEXT: orr w10, w11, w10 +; CHECK-NEXT: umov w8, v0.h[3] +; CHECK-NEXT: orr w9, w10, w9 +; CHECK-NEXT: orr w8, w9, w8 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret %b = call i1 @llvm.vector.reduce.umax.v4i1(<4 x i1> %a)
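
A note on the SelectionDAG::getNode changes above: for i1 values, the set bit pattern reads as -1 when interpreted as signed and as 1 when interpreted as unsigned, so SMIN/UMAX coincide with OR and SMAX/UMIN coincide with AND. That is why both the element-wise min/max nodes and the VECREDUCE_SMIN/SMAX/UMIN/UMAX forms are rewritten to OR/AND when the element type is i1. The following is a small standalone C++ sketch (not LLVM code; plain integers model the i1 lanes) that exhaustively checks those identities:

#include <algorithm>
#include <cassert>

// Model one i1 lane: bit b has unsigned value b and signed value -b
// (the all-ones pattern reads as -1 when treated as signed).
int signedVal(int b) { return -b; }
unsigned unsignedVal(int b) { return b; }

int main() {
  for (int a = 0; a <= 1; ++a) {
    for (int b = 0; b <= 1; ++b) {
      // smin/umax pick the lane whose bit is set, i.e. they behave like OR.
      assert(std::min(signedVal(a), signedVal(b)) == signedVal(a | b));
      assert(std::max(unsignedVal(a), unsignedVal(b)) == unsignedVal(a | b));
      // smax/umin pick the lane whose bit is clear, i.e. they behave like AND.
      assert(std::max(signedVal(a), signedVal(b)) == signedVal(a & b));
      assert(std::min(unsignedVal(a), unsignedVal(b)) == unsignedVal(a & b));
    }
  }
  return 0;
}

The same identities carry over to the reductions, since folding min or max across the lanes of an i1 vector is just folding OR or AND across them.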
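
LowerPredReductionToSVE above maps an OR reduction of a predicate to a PTEST with the ANY_ACTIVE condition, an AND reduction to a PTEST with the NONE_ACTIVE condition after XOR-ing the input with the governing predicate (all(x) is equivalent to not any(not x)), and an XOR reduction to the low bit of the active-lane count produced by CNTP. Below is a standalone C++ model of those three equivalences checked over every 4-lane predicate value; it assumes an all-true governing predicate, as getPredicateForVector builds for a scalable type, and uses no SVE intrinsics, so it is illustrative only:

#include <cassert>
#include <vector>

// Naive lane-by-lane reductions of a predicate (a vector of i1 lanes).
static bool reduceOr(const std::vector<bool> &P) {
  bool R = false;
  for (bool L : P) R = R || L;
  return R;
}
static bool reduceAnd(const std::vector<bool> &P) {
  bool R = true;
  for (bool L : P) R = R && L;
  return R;
}
static bool reduceXor(const std::vector<bool> &P) {
  bool R = false;
  for (bool L : P) R = R != L;
  return R;
}

int main() {
  // Exhaustively check every 4-lane predicate value.
  for (unsigned Bits = 0; Bits < 16; ++Bits) {
    std::vector<bool> P;
    unsigned Count = 0; // CNTP: number of active lanes.
    bool Any = false;   // PTEST with the ANY_ACTIVE condition.
    for (unsigned I = 0; I < 4; ++I) {
      bool Lane = (Bits >> I) & 1;
      P.push_back(Lane);
      Count += Lane;
      Any = Any || Lane;
    }
    // The AND lowering inverts the input (P xor Pg, with Pg all true) and
    // then checks that no lane of the result is active.
    bool AnyInverted = false;
    for (bool Lane : P)
      AnyInverted = AnyInverted || !Lane;
    bool NoneOfInverted = !AnyInverted;

    assert(reduceOr(P) == Any);                 // VECREDUCE_OR  -> any active
    assert(reduceAnd(P) == NoneOfInverted);     // VECREDUCE_AND -> none active after inversion
    assert(reduceXor(P) == ((Count & 1) != 0)); // VECREDUCE_XOR -> CNTP & 1
  }
  return 0;
}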
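
The sve-split-int-reduce.ll tests show the pattern used for reductions whose input is wider than a legal scalable register: the registers are first combined lane-wise with the reduction's binary operation (for example orr z0.d, z0.d, z1.d) and then a single legal-width reduction (orv, uaddv, smaxv, and so on) produces the scalar. This is sound because the underlying operations are associative and commutative. A standalone C++ sketch of that equivalence, using an OR reduction and a hypothetical 8-lane input split into two 4-lane halves:

#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

// Reduce a vector with OR directly, lane by lane.
static uint64_t reduceOr(const std::vector<uint64_t> &V) {
  return std::accumulate(V.begin(), V.end(), uint64_t{0},
                         [](uint64_t A, uint64_t B) { return A | B; });
}

int main() {
  // A "wide" 8-lane input that the target would hold in two 4-lane registers.
  std::vector<uint64_t> Wide = {1, 2, 4, 8, 16, 32, 64, 128};

  // Split route: combine the halves lane-wise first (the "orr z0.d, z0.d, z1.d"
  // step), then reduce the single half-width vector.
  std::vector<uint64_t> Lo(Wide.begin(), Wide.begin() + 4);
  std::vector<uint64_t> Hi(Wide.begin() + 4, Wide.end());
  std::vector<uint64_t> Combined(4);
  for (unsigned I = 0; I < 4; ++I)
    Combined[I] = Lo[I] | Hi[I];

  // Both routes produce the same scalar.
  assert(reduceOr(Wide) == reduceOr(Combined));
  return 0;
}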