Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20804,7 +20804,7 @@
   unsigned Opcode = N->getOpcode();
 
   // VECREDUCE over 1-element vector is just an extract.
-  if (VT.getVectorNumElements() == 1) {
+  if (VT.getVectorElementCount().isScalar()) {
     SDLoc dl(N);
     SDValue Res =
         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7982,6 +7982,10 @@
   SDValue Op = Node->getOperand(0);
   EVT VT = Op.getValueType();
 
+  if (VT.isScalableVector())
+    report_fatal_error(
+        "Expanding reductions for scalable vectors is undefined.");
+
   // Try to use a shuffle reduction for power of two vectors.
   if (VT.isPow2VectorType()) {
     while (VT.getVectorNumElements() > 1) {
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -931,8 +931,10 @@
   SDValue LowerFixedLengthVectorIntExtendToSVE(SDValue Op,
                                                SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFixedLengthReductionToSVE(unsigned Opcode, SDValue ScalarOp,
-                                         SelectionDAG &DAG) const;
+  SDValue LowerPredReductionToSVE(unsigned Opcode, SDValue ScalarOp, SDValue Pg,
+                                  SelectionDAG &DAG) const;
+  SDValue LowerReductionToSVE(unsigned Opcode, SDValue ScalarOp,
+                              SelectionDAG &DAG, bool OverrideNEON) const;
   SDValue LowerFixedLengthVectorSelectToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorSetccToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1012,6 +1012,14 @@
       setOperationAction(ISD::SHL, VT, Custom);
       setOperationAction(ISD::SRL, VT, Custom);
       setOperationAction(ISD::SRA, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
     }
 
     // Illegal unpacked integer vector types.
@@ -1026,6 +1034,8 @@
       setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
       setOperationAction(ISD::TRUNCATE, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
 
       // There are no legal MVT::nxv16f## based types.
       if (VT != MVT::nxv16i1) {
@@ -9739,28 +9749,31 @@
                       Op.getOpcode() == ISD::VECREDUCE_XOR ||
                       (Op.getOpcode() != ISD::VECREDUCE_ADD &&
                        SrcVT.getVectorElementType() == MVT::i64);
-  if (useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
+  if (SrcVT.isScalableVector() ||
+      useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
     switch (Op.getOpcode()) {
     case ISD::VECREDUCE_ADD:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG, OverrideNEON);
     case ISD::VECREDUCE_AND:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG, OverrideNEON);
     case ISD::VECREDUCE_OR:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG, OverrideNEON);
     case ISD::VECREDUCE_SMAX:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG, OverrideNEON);
     case ISD::VECREDUCE_SMIN:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG, OverrideNEON);
     case ISD::VECREDUCE_UMAX:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG, OverrideNEON);
     case ISD::VECREDUCE_UMIN:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG, OverrideNEON);
     case ISD::VECREDUCE_XOR:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG, OverrideNEON);
     case ISD::VECREDUCE_FMAX:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG,
+                                 OverrideNEON);
     case ISD::VECREDUCE_FMIN:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG,
+                                 OverrideNEON);
     default:
       llvm_unreachable("Unhandled fixed length reduction");
     }
@@ -16129,15 +16142,49 @@
   return convertFromScalableVector(DAG, VT, ScalableRes);
 }
 
-SDValue AArch64TargetLowering::LowerFixedLengthReductionToSVE(unsigned Opcode,
-    SDValue ScalarOp, SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerPredReductionToSVE(unsigned Opcode,
+                                                       SDValue ScalarOp,
+                                                       SDValue Pg,
+                                                       SelectionDAG &DAG) const {
+  SDValue Op = ScalarOp.getOperand(0);
+  EVT OpVT = Op.getValueType();
+
+  if (!OpVT.isScalableVector() && OpVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  AArch64CC::CondCode Cond;
+  switch (ScalarOp.getOpcode()) {
+  default:
+    return SDValue();
+  case ISD::VECREDUCE_OR:
+    Cond = AArch64CC::ANY_ACTIVE;
+    break;
+  case ISD::VECREDUCE_AND:
+    Cond = AArch64CC::NONE_ACTIVE;
+    Op = DAG.getNode(ISD::XOR, SDLoc(ScalarOp), OpVT, Op, Pg);
+    break;
+  }
+
+  return getPTest(DAG, ScalarOp.getValueType(), Pg, Op, Cond);
+}
+
+SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
+                                                   SDValue ScalarOp,
+                                                   SelectionDAG &DAG,
+                                                   bool OverrideNEON) const {
   SDLoc DL(ScalarOp);
   SDValue VecOp = ScalarOp.getOperand(0);
   EVT SrcVT = VecOp.getValueType();
 
+  if (useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
+    EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
+    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
+  }
+
   SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
-  EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
-  VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
+
+  if (SrcVT.getVectorElementType() == MVT::i1)
+    return LowerPredReductionToSVE(Opcode, ScalarOp, Pg, DAG);
 
   // UADDV always returns an i64 result.
   EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
Index: llvm/test/CodeGen/AArch64/sve-int-reduce.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-int-reduce.ll
@@ -0,0 +1,515 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+; ANDV
+
+define i8 @andv_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: andv_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    andv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.and.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @andv_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: andv_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    andv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.and.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @andv_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: andv_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    andv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @andv_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: andv_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    andv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+define i1 @reduce_and_nxv16i1(<vscale x 16 x i1> %vec) {
+; CHECK-LABEL: reduce_and_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.and.i1.nxv16i1(<vscale x 16 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_and_nxv8i1(<vscale x 8 x i1> %vec) {
+; CHECK-LABEL: reduce_and_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.and.i1.nxv8i1(<vscale x 8 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_and_nxv4i1(<vscale x 4 x i1> %vec) {
+; CHECK-LABEL: reduce_and_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.and.i1.nxv4i1(<vscale x 4 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_and_nxv2i1(<vscale x 2 x i1> %vec) {
+; CHECK-LABEL: reduce_and_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.and.i1.nxv2i1(<vscale x 2 x i1> %vec)
+  ret i1 %res
+}
+
+; ORV
+
+define i8 @orv_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: orv_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    orv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.or.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @orv_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: orv_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    orv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.or.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @orv_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: orv_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    orv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @orv_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: orv_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    orv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+define i1 @reduce_or_nxv16i1(<vscale x 16 x i1> %vec) {
+; CHECK-LABEL: reduce_or_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.or.i1.nxv16i1(<vscale x 16 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_or_nxv8i1(<vscale x 8 x i1> %vec) {
+; CHECK-LABEL: reduce_or_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.or.i1.nxv8i1(<vscale x 8 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_or_nxv4i1(<vscale x 4 x i1> %vec) {
+; CHECK-LABEL: reduce_or_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.or.i1.nxv4i1(<vscale x 4 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_or_nxv2i1(<vscale x 2 x i1> %vec) {
+; CHECK-LABEL: reduce_or_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.or.i1.nxv2i1(<vscale x 2 x i1> %vec)
+  ret i1 %res
+}
+
+; XORV
+
+define i8 @xorv_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: xorv_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    eorv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.xor.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @xorv_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: xorv_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    eorv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.xor.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @xorv_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: xorv_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    eorv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @xorv_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: xorv_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    eorv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.xor.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+; UADDV
+
+define i8 @uaddv_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: uaddv_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uaddv d0, p0, z0.b
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @uaddv_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: uaddv_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uaddv d0, p0, z0.h
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @uaddv_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: uaddv_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @uaddv_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: uaddv_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uaddv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+; UMINV
+
+define i8 @umin_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: umin_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uminv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.umin.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @umin_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: umin_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uminv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.umin.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @umin_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: umin_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uminv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @umin_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: umin_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.umin.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+; SMINV
+
+define i8 @smin_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: smin_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    sminv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.smin.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @smin_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: smin_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    sminv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.smin.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @smin_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: smin_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sminv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @smin_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: smin_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.smin.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+; UMAXV
+
+define i8 @umax_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: umax_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    umaxv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.umax.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @umax_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: umax_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    umaxv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.umax.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @umax_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: umax_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    umaxv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @umax_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: umax_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    umaxv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.umax.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+; SMAXV
+
+define i8 @smax_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: smax_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    smaxv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.smax.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @smax_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: smax_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    smaxv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @smax_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: smax_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    smaxv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @smax_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: smax_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    smaxv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.smax.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+declare i8 @llvm.vector.reduce.and.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.and.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64>)
+
+declare i1 @llvm.vector.reduce.and.i1.nxv16i1(<vscale x 16 x i1> %vec)
+declare i1 @llvm.vector.reduce.and.i1.nxv8i1(<vscale x 8 x i1> %vec)
+declare i1 @llvm.vector.reduce.and.i1.nxv4i1(<vscale x 4 x i1> %vec)
+declare i1 @llvm.vector.reduce.and.i1.nxv2i1(<vscale x 2 x i1> %vec)
+
+declare i8 @llvm.vector.reduce.or.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.or.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64>)
+
+declare i1 @llvm.vector.reduce.or.i1.nxv16i1(<vscale x 16 x i1> %vec)
+declare i1 @llvm.vector.reduce.or.i1.nxv8i1(<vscale x 8 x i1> %vec)
+declare i1 @llvm.vector.reduce.or.i1.nxv4i1(<vscale x 4 x i1> %vec)
+declare i1 @llvm.vector.reduce.or.i1.nxv2i1(<vscale x 2 x i1> %vec)
+
+declare i8 @llvm.vector.reduce.xor.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.xor.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.xor.nxv2i64(<vscale x 2 x i64>)
+
+declare i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)
+
+declare i8 @llvm.vector.reduce.umin.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.umin.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.umin.nxv2i64(<vscale x 2 x i64>)
+
+declare i8 @llvm.vector.reduce.smin.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.smin.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.smin.nxv2i64(<vscale x 2 x i64>)
+
+declare i8 @llvm.vector.reduce.umax.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.umax.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.umax.nxv2i64(<vscale x 2 x i64>)
+
+declare i8 @llvm.vector.reduce.smax.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.smax.nxv2i64(<vscale x 2 x i64>)
Index: llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
@@ -0,0 +1,280 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+; ANDV
+
+define i8 @andv_nxv8i8(<vscale x 8 x i8> %a) {
+; CHECK-LABEL: andv_nxv8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    andv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.and.nxv8i8(<vscale x 8 x i8> %a)
+  ret i8 %res
+}
+
+define i32 @andv_nxv8i32(<vscale x 8 x i32> %a) {
+; CHECK-LABEL: andv_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    andv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> %a)
+  ret i32 %res
+}
+
+define i1 @andv_nxv32i1(<vscale x 32 x i1> %a) {
+; CHECK-LABEL: andv_nxv32i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p2.b
+; CHECK-NEXT:    and p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    not p0.b, p2/z, p0.b
+; CHECK-NEXT:    ptest p2, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.and.nxv32i1(<vscale x 32 x i1> %a)
+  ret i1 %res
+}
+
+define i1 @andv_nxv64i1(<vscale x 64 x i1> %a) {
+; CHECK-LABEL: andv_nxv64i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p4.b
+; CHECK-NEXT:    and p1.b, p4/z, p1.b, p3.b
+; CHECK-NEXT:    and p0.b, p4/z, p0.b, p2.b
+; CHECK-NEXT:    and p0.b, p4/z, p0.b, p1.b
+; CHECK-NEXT:    not p0.b, p4/z, p0.b
+; CHECK-NEXT:    ptest p4, p0.b
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.and.nxv64i1(<vscale x 64 x i1> %a)
+  ret i1 %res
+}
+
+; ORV
+
+define i32 @orv_nxv2i32(<vscale x 2 x i32> %a) {
+; CHECK-LABEL: orv_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    orv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @orv_nxv8i64(<vscale x 8 x i64> %a) {
+; CHECK-LABEL: orv_nxv8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr z1.d, z1.d, z3.d
+; CHECK-NEXT:    orr z0.d, z0.d, z2.d
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    orv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.or.nxv8i64(<vscale x 8 x i64> %a)
+  ret i64 %res
+}
+
+define i1 @orv_nxv32i1(<vscale x 32 x i1> %a) {
+; CHECK-LABEL: orv_nxv32i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p2.b
+; CHECK-NEXT:    orr p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    ptest p2, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.or.nxv32i1(<vscale x 32 x i1> %a)
+  ret i1 %res
+}
+
+; XORV
+
+define i16 @xorv_nxv2i16(<vscale x 2 x i16> %a) {
+; CHECK-LABEL: xorv_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    eorv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.xor.nxv2i16(<vscale x 2 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @xorv_nxv8i32(<vscale x 8 x i32> %a) {
+; CHECK-LABEL: xorv_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    eorv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> %a)
+  ret i32 %res
+}
+
+; UADDV
+
+define i16 @uaddv_nxv4i16(<vscale x 4 x i16> %a) {
+; CHECK-LABEL: uaddv_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %a)
+  ret i16 %res
+}
+
+define i16 @uaddv_nxv16i16(<vscale x 16 x i16> %a) {
+; CHECK-LABEL: uaddv_nxv16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add z0.h, z0.h, z1.h
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uaddv d0, p0, z0.h
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @uaddv_nxv16i32(<vscale x 16 x i32> %a) {
+; CHECK-LABEL: uaddv_nxv16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add z1.s, z1.s, z3.s
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %a)
+  ret i32 %res
+}
+
+; UMINV
+
+define i32 @umin_nxv2i32(<vscale x 2 x i32> %a) {
+; CHECK-LABEL: umin_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    uminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.umin.nxv2i32(<vscale x 2 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @umin_nxv4i64(<vscale x 4 x i64> %a) {
+; CHECK-LABEL: umin_nxv4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    uminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %a)
+  ret i64 %res
+}
+
+; SMINV
+
+define i8 @smin_nxv4i8(<vscale x 4 x i8> %a) {
+; CHECK-LABEL: smin_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT:    sminv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.smin.nxv4i8(<vscale x 4 x i8> %a)
+  ret i8 %res
+}
+
+define i32 @smin_nxv8i32(<vscale x 8 x i32> %a) {
+; CHECK-LABEL: smin_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    sminv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> %a)
+  ret i32 %res
+}
+
+; UMAXV
+
+define i16 @smin_nxv16i16(<vscale x 16 x i16> %a) {
+; CHECK-LABEL: smin_nxv16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    umaxv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.umax.nxv16i16(<vscale x 16 x i16> %a)
+  ret i16 %res
+}
+
+; SMAXV
+
+define i64 @smin_nxv8i64(<vscale x 8 x i64> %a) {
+; CHECK-LABEL: smin_nxv8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    smax z1.d, p0/m, z1.d, z3.d
+; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    smaxv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.smax.nxv8i64(<vscale x 8 x i64> %a)
+  ret i64 %res
+}
+
+declare i8 @llvm.vector.reduce.and.nxv8i8(<vscale x 8 x i8>)
+declare i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32>)
+declare i1 @llvm.vector.reduce.and.nxv32i1(<vscale x 32 x i1>)
+declare i1 @llvm.vector.reduce.and.nxv64i1(<vscale x 64 x i1>)
+
+declare i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32>)
+declare i64 @llvm.vector.reduce.or.nxv8i64(<vscale x 8 x i64>)
+declare i1 @llvm.vector.reduce.or.nxv32i1(<vscale x 32 x i1>)
+
+declare i16 @llvm.vector.reduce.xor.nxv2i16(<vscale x 2 x i16>)
+declare i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32>)
+
+declare i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16>)
+declare i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16>)
+declare i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32>)
+
+declare i32 @llvm.vector.reduce.umin.nxv2i32(<vscale x 2 x i32>)
+declare i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64>)
+
+declare i8 @llvm.vector.reduce.smin.nxv4i8(<vscale x 4 x i8>)
+declare i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32>)
+
+declare i16 @llvm.vector.reduce.umax.nxv16i16(<vscale x 16 x i16>)
+
+declare i64 @llvm.vector.reduce.smax.nxv8i64(<vscale x 8 x i64>)