Index: llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1151,6 +1151,11 @@
     break;
+  case ISD::VECREDUCE_SEQ_FADD:
+  case ISD::VECREDUCE_SEQ_FMUL:
+    Action = TLI.getOperationAction(
+        Node->getOpcode(), Node->getOperand(1).getValueType());
+    break;
   case ISD::VECREDUCE_FADD:
   case ISD::VECREDUCE_FMUL:
   case ISD::VECREDUCE_ADD:
   case ISD::VECREDUCE_MUL:
   case ISD::VECREDUCE_AND:
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -931,6 +931,8 @@
   SDValue LowerFixedLengthVectorIntExtendToSVE(SDValue Op,
                                                SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFixedLengthReductionAccToSVE(unsigned Opcode, SDValue ScalarOp,
+                                            SelectionDAG &DAG) const;
   SDValue LowerFixedLengthReductionToSVE(unsigned Opcode, SDValue ScalarOp,
                                          SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorSelectToSVE(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1118,12 +1118,20 @@
       setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
       setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
       setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
+
+      // Int operations with no NEON 64b/128b support.
       for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
                       MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
         setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
         setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
         setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
       }
+
+      // FP operations with no NEON 64b/128b support.
+      for (auto VT : {MVT::v2f32, MVT::v4f32, MVT::v1f64, MVT::v2f64})
+        // FIXME: There is a NEON FADDA instruction, but it's not
+        // currently supported.
+        setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
     }
   }
@@ -1263,6 +1271,7 @@
   setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
   setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
   setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+  setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
   setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
   setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
   setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
@@ -3951,6 +3960,7 @@
   case ISD::VECREDUCE_SMIN:
   case ISD::VECREDUCE_UMAX:
   case ISD::VECREDUCE_UMIN:
+  case ISD::VECREDUCE_SEQ_FADD:
   case ISD::VECREDUCE_FMAX:
   case ISD::VECREDUCE_FMIN:
     return LowerVECREDUCE(Op, DAG);
@@ -9730,11 +9740,14 @@
 
 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
                                               SelectionDAG &DAG) const {
-  SDValue Src = Op.getOperand(0);
+  // The first argument of VECREDUCE_SEQ_FADD is the initial value.
+  SDValue Src = (Op.getOpcode() == ISD::VECREDUCE_SEQ_FADD) ? Op.getOperand(1)
+                                                            : Op.getOperand(0);
 
   // Try to lower fixed length reductions to SVE.
   EVT SrcVT = Src.getValueType();
-  bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND ||
+  bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_SEQ_FADD ||
+                      Op.getOpcode() == ISD::VECREDUCE_AND ||
                       Op.getOpcode() == ISD::VECREDUCE_OR ||
                       Op.getOpcode() == ISD::VECREDUCE_XOR ||
                       (Op.getOpcode() != ISD::VECREDUCE_ADD &&
@@ -9757,6 +9770,8 @@
     return LowerFixedLengthReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
   case ISD::VECREDUCE_XOR:
     return LowerFixedLengthReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
+  case ISD::VECREDUCE_SEQ_FADD:
+    return LowerFixedLengthReductionAccToSVE(AArch64ISD::FADDA_PRED, Op, DAG);
   case ISD::VECREDUCE_FMAX:
     return LowerFixedLengthReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
   case ISD::VECREDUCE_FMIN:
@@ -16129,6 +16144,31 @@
   return convertFromScalableVector(DAG, VT, ScalableRes);
 }
 
+SDValue AArch64TargetLowering::LowerFixedLengthReductionAccToSVE(
+    unsigned Opcode, SDValue ScalarOp, SelectionDAG &DAG) const {
+  SDLoc DL(ScalarOp);
+  SDValue AccOp = ScalarOp.getOperand(0);
+  SDValue VecOp = ScalarOp.getOperand(1);
+
+  // Convert operands to Scalable.
+  EVT SrcVT = VecOp.getValueType();
+  EVT ResVT = SrcVT.getVectorElementType();
+  SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
+  EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
+  // FIXME: This produces a masked MOV. Do we care about the false lanes?
+  AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
+                      DAG.getUNDEF(ContainerVT), AccOp,
+                      DAG.getConstant(0, DL, MVT::i64));
+  VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
+
+  // Perform reduction.
+  SDValue Rdx = DAG.getNode(Opcode, DL, getPackedSVEVectorVT(ResVT), Pg,
+                            AccOp, VecOp);
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx,
+                     DAG.getConstant(0, DL, MVT::i64));
+}
+
 SDValue AArch64TargetLowering::LowerFixedLengthReductionToSVE(unsigned Opcode,
     SDValue ScalarOp, SelectionDAG &DAG) const {
   SDLoc DL(ScalarOp);
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -220,6 +220,9 @@
   bool shouldExpandReduction(const IntrinsicInst *II) const {
     switch (II->getIntrinsicID()) {
     case Intrinsic::vector_reduce_fadd:
+      if (ST->hasSVE())
+        return false;
+      LLVM_FALLTHROUGH;
     case Intrinsic::vector_reduce_fmul:
       // We don't have legalization support for ordered FP reductions.
       return !II->getFastMathFlags().allowReassoc();
Index: llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
+++ llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll
@@ -20,6 +20,167 @@
 ; Don't use SVE when its registers are no bigger than NEON.
 ; NO_SVE-NOT: ptrue
 
+;
+; FADDA
+;
+
+; NEON 64-bit FADDA not currently supported.
+define float @fadda_v2f32(float %start, <2 x float> %a) #0 {
+; CHECK-LABEL: fadda_v2f32:
+; CHECK: ptrue [[PG:p[0-9]+]].s, vl1
+; CHECK-NEXT: mov z[[ACC:[0-9]+]].s, [[PG]]/m, z0.s
+; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl2
+; CHECK-NEXT: fadda s0, [[PG1]], s[[ACC]], z1.s
+; CHECK-NEXT: ret
+  %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
+  ret float %res
+}
+
+; NEON 128-bit FADDA not currently supported.
+define float @fadda_v4f32(float %start, <4 x float> %a) #0 {
+; CHECK-LABEL: fadda_v4f32:
+; CHECK: ptrue [[PG1:p[0-9]+]].s, vl1
+; CHECK-NEXT: mov z[[ACC:[0-9]+]].s, [[PG1]]/m, z0.s
+; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl4
+; CHECK-NEXT: fadda s0, [[PG]], s[[ACC]], z1.s
+; CHECK-NEXT: ret
+  %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
+  ret float %res
+}
+
+define float @fadda_v8f32(float %start, <8 x float>* %a) #0 {
+; CHECK-LABEL: fadda_v8f32:
+; VBITS_GE_256: ptrue [[PG:p[0-9]+]].s, vl8
+; VBITS_GE_256-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_GE_256-NEXT: ptrue [[PG1:p[0-9]+]].s, vl1
+; VBITS_GE_256-NEXT: mov z[[ACC:[0-9]+]].s, [[PG1]]/m, z0.s
+; VBITS_GE_256-NEXT: fadda s0, [[PG]], s[[ACC]], [[OP]].s
+; VBITS_GE_256-NEXT: ret
+  %op = load <8 x float>, <8 x float>* %a
+  %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
+  ret float %res
+}
+
+define float @fadda_v16f32(float %start, <16 x float>* %a) #0 {
+; CHECK-LABEL: fadda_v16f32:
+; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
+; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].s, vl1
+; VBITS_GE_512-NEXT: mov z[[ACC:[0-9]+]].s, [[PG1]]/m, z0.s
+; VBITS_GE_512-NEXT: fadda s0, [[PG]], s[[ACC]], [[OP]].s
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; FIXME: Legalisation is broken.
+  %op = load <16 x float>, <16 x float>* %a
+  %res = call float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
+  ret float %res
+}
+
+define float @fadda_v32f32(float %start, <32 x float>* %a) #0 {
+; CHECK-LABEL: fadda_v32f32:
+; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
+; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].s, vl1
+; VBITS_GE_1024-NEXT: mov z[[ACC:[0-9]+]].s, [[PG1]]/m, z0.s
+; VBITS_GE_1024-NEXT: fadda s0, [[PG]], s[[ACC]], [[OP]].s
+; VBITS_GE_1024-NEXT: ret
+  %op = load <32 x float>, <32 x float>* %a
+  %res = call float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
+  ret float %res
+}
+
+define float @fadda_v64f32(float %start, <64 x float>* %a) #0 {
+; CHECK-LABEL: fadda_v64f32:
+; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
+; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
+; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl1
+; VBITS_GE_2048-NEXT: mov z[[ACC:[0-9]+]].s, [[PG1]]/m, z0.s
+; VBITS_GE_2048-NEXT: fadda s0, [[PG]], s[[ACC]], [[OP]].s
+; VBITS_GE_2048-NEXT: ret
+  %op = load <64 x float>, <64 x float>* %a
+  %res = call float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
+  ret float %res
+}
+
+; NEON 64-bit FADDA not currently supported.
+define double @fadda_v1f64(double %start, <1 x double> %a) #0 {
+; CHECK-LABEL: fadda_v1f64:
+; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
+; CHECK-NEXT: mov z[[ACC:[0-9]+]].d, [[PG]]/m, z0.d
+; CHECK-NEXT: fadda d0, [[PG]], d[[ACC]], z1.d
+; CHECK-NEXT: ret
+  %res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
+  ret double %res
+}
+
+; NEON 128-bit FADDA not currently supported.
+define double @fadda_v2f64(double %start, <2 x double> %a) #0 {
+; CHECK-LABEL: fadda_v2f64:
+; CHECK: ptrue [[PG1:p[0-9]+]].d, vl1
+; CHECK-NEXT: mov z[[ACC:[0-9]+]].d, [[PG1]]/m, z0.d
+; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl2
+; CHECK-NEXT: fadda d0, [[PG]], d[[ACC]], z1.d
+; CHECK-NEXT: ret
+  %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
+  ret double %res
+}
+
+define double @fadda_v4f64(double %start, <4 x double>* %a) #0 {
+; CHECK-LABEL: fadda_v4f64:
+; VBITS_GE_256: ptrue [[PG:p[0-9]+]].d, vl4
+; VBITS_GE_256-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_GE_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl1
+; VBITS_GE_256-NEXT: mov z[[ACC:[0-9]+]].d, [[PG1]]/m, z0.d
+; VBITS_GE_256-NEXT: fadda d0, [[PG]], d[[ACC]], [[OP]].d
+; VBITS_GE_256-NEXT: ret
+  %op = load <4 x double>, <4 x double>* %a
+  %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
+  ret double %res
+}
+
+define double @fadda_v8f64(double %start, <8 x double>* %a) #0 {
+; CHECK-LABEL: fadda_v8f64:
+; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl1
+; VBITS_GE_512-NEXT: mov z[[ACC:[0-9]+]].d, [[PG1]]/m, z0.d
+; VBITS_GE_512-NEXT: fadda d0, [[PG]], d[[ACC]], [[OP]].d
+; VBITS_GE_512-NEXT: ret
+
+; Ensure sensible type legalisation.
+; FIXME: Legalisation is broken.
+  %op = load <8 x double>, <8 x double>* %a
+  %res = call double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
+  ret double %res
+}
+
+define double @fadda_v16f64(double %start, <16 x double>* %a) #0 {
+; CHECK-LABEL: fadda_v16f64:
+; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
+; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl1
+; VBITS_GE_1024-NEXT: mov z[[ACC:[0-9]+]].d, [[PG1]]/m, z0.d
+; VBITS_GE_1024-NEXT: fadda d0, [[PG]], d[[ACC]], [[OP]].d
+; VBITS_GE_1024-NEXT: ret
+  %op = load <16 x double>, <16 x double>* %a
+  %res = call double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
+  ret double %res
+}
+
+define double @fadda_v32f64(double %start, <32 x double>* %a) #0 {
+; CHECK-LABEL: fadda_v32f64:
+; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
+; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
+; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl1
+; VBITS_GE_2048-NEXT: mov z[[ACC:[0-9]+]].d, [[PG1]]/m, z0.d
+; VBITS_GE_2048-NEXT: fadda d0, [[PG]], d[[ACC]], [[OP]].d
+; VBITS_GE_2048-NEXT: ret
+  %op = load <32 x double>, <32 x double>* %a
+  %res = call double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
+  ret double %res
+}
+
 ;
 ; FMAXV
 ;
@@ -456,6 +617,20 @@
 attributes #0 = { "target-features"="+sve" }
 
+declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
+declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
+declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>)
+declare float @llvm.vector.reduce.fadd.v32f32(float, <32 x float>)
+declare float @llvm.vector.reduce.fadd.v64f32(float, <64 x float>)
+
+declare double @llvm.vector.reduce.fadd.v1f64(double, <1 x double>)
+declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
+declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>)
+declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>)
+declare double @llvm.vector.reduce.fadd.v32f64(double, <32 x double>)
+
 declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>)
 declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>)
 declare half @llvm.vector.reduce.fmax.v16f16(<16 x half>)