Index: include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- include/llvm/Target/TargetSelectionDAG.td
+++ include/llvm/Target/TargetSelectionDAG.td
@@ -280,6 +280,10 @@
                            SDTCisVT<2, OtherVT>, SDTCisVT<3, OtherVT>, SDTCisPtrTy<4>, SDTCisPtrTy<5>
 ]>;
 
+def SDT_FPVecReduce : SDTypeProfile<1, 1, [
+  SDTCisFP<0>, SDTCisVec<1>
+]>;
+
 class SDCallSeqStart<list<SDTypeConstraint> constraints> :
         SDTypeProfile<0, 2, constraints>;
 class SDCallSeqEnd<list<SDTypeConstraint> constraints> :
@@ -565,6 +569,9 @@
 def extract_subvector : SDNode<"ISD::EXTRACT_SUBVECTOR", SDTSubVecExtract, []>;
 def insert_subvector : SDNode<"ISD::INSERT_SUBVECTOR", SDTSubVecInsert, []>;
 
+// Operators for vecreduce nodes.
+def vec_reduce_fadd : SDNode<"ISD::VECREDUCE_FADD", SDT_FPVecReduce>;
+
 // Nodes for intrinsics, you should use the intrinsic itself and let tblgen use
 // these internally. Don't reference these directly.
 def intrinsic_void : SDNode<"ISD::INTRINSIC_VOID",
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -710,6 +710,9 @@
                     MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
       setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+
+      if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())
+        setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
     }
 
     setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -4354,6 +4354,23 @@
 defm FMAXP   : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">;
 defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">;
 defm FMINP   : SIMDFPPairwiseScalar<1, 0b01111, "fminp">;
+
+let Predicates = [HasFullFP16] in {
+def : Pat<(f16 (vec_reduce_fadd (v8f16 V128:$Rn))),
+          (FADDPv2i16p
+            (EXTRACT_SUBREG
+               (FADDPv8f16 (FADDPv8f16 V128:$Rn, V128:$Rn), (v8f16 (IMPLICIT_DEF))),
+               dsub))>;
+def : Pat<(f16 (vec_reduce_fadd (v4f16 V64:$Rn))),
+          (FADDPv2i16p (FADDPv4f16 V64:$Rn, V64:$Rn))>;
+}
+def : Pat<(f32 (vec_reduce_fadd (v4f32 V128:$Rn))),
+          (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>;
+def : Pat<(f32 (vec_reduce_fadd (v2f32 V64:$Rn))),
+          (FADDPv2i32p V64:$Rn)>;
+def : Pat<(f64 (vec_reduce_fadd (v2f64 V128:$Rn))),
+          (FADDPv2i64p V128:$Rn)>;
+
 def : Pat<(v2i64 (AArch64saddv V128:$Rn)),
           (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>;
 def : Pat<(v2i64 (AArch64uaddv V128:$Rn)),
Index: test/CodeGen/AArch64/vecreduce-fadd-legalization.ll
===================================================================
--- test/CodeGen/AArch64/vecreduce-fadd-legalization.ll
+++ test/CodeGen/AArch64/vecreduce-fadd-legalization.ll
@@ -49,9 +49,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmov s1, wzr
 ; CHECK-NEXT:    mov v0.s[3], v1.s[0]
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    fadd v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    faddp s0, v0.2s
+; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    faddp s0, v0.2s
 ; CHECK-NEXT:    ret
   %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a)
   ret float %b
@@ -74,9 +73,8 @@
 ; CHECK-NEXT:    fadd v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    fadd v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT:    fadd v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    faddp s0, v0.2s
+; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    faddp s0, v0.2s
 ; CHECK-NEXT:    ret
   %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a)
   ret float %b
Index: test/CodeGen/AArch64/vecreduce-fadd.ll
===================================================================
--- test/CodeGen/AArch64/vecreduce-fadd.ll
+++ test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -5,39 +5,30 @@
 ; CHECK-LABEL: add_HalfS:
 ; CHECK: faddp s0, v0.2s
 ; CHECK-NEXT: ret
-  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(<2 x float> undef, <2 x float> %bin.rdx)
+  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float undef, <2 x float> %bin.rdx)
   ret float %r
 }
 
 define half @add_HalfH(<4 x half> %bin.rdx) {
 ; CHECK-LABEL: add_HalfH:
-; CHECK: mov h3, v0.h[1]
-; CHECK-NEXT: mov h1, v0.h[3]
-; CHECK-NEXT: mov h2, v0.h[2]
-; CHECK-NEXT: fadd h0, h0, h3
-; CHECK-NEXT: fadd h0, h0, h2
-; CHECK-NEXT: fadd h0, h0, h1
+; CHECK: faddp v0.4h, v0.4h, v0.4h
+; CHECK-NEXT: faddp h0, v0.2h
 ; CHECK-NEXT: ret
 ; CHECKNOFP16-LABEL: add_HalfH:
 ; CHECKNOFP16-NOT: faddp
 ; CHECKNOFP16-NOT: fadd h{{[0-9]+}}
 ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h
 ; CHECKNOFP16: ret
-  %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v4f16(<4 x half> undef, <4 x half> %bin.rdx)
+  %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half undef, <4 x half> %bin.rdx)
   ret half %r
 }
 
 define half @add_H(<8 x half> %bin.rdx) {
 ; CHECK-LABEL: add_H:
-; CHECK: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: mov h1, v0.h[1]
-; CHECK-NEXT: mov h2, v0.h[2]
-; CHECK-NEXT: fadd h1, h0, h1
-; CHECK-NEXT: fadd h1, h1, h2
-; CHECK-NEXT: mov h0, v0.h[3]
-; CHECK-NEXT: fadd h0, h1, h0
+; CHECK: faddp v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: faddp v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: faddp h0, v0.2h
 ; CHECK-NEXT: ret
 ; CHECKNOFP16-LABEL: add_H:
@@ -45,17 +36,16 @@
 ; CHECKNOFP16-NOT: fadd h{{[0-9]+}}
 ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h
 ; CHECKNOFP16: ret
-  %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v8f16(<8 x half> undef, <8 x half> %bin.rdx)
+  %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half undef, <8 x half> %bin.rdx)
   ret half %r
 }
 
 define float @add_S(<4 x float> %bin.rdx) {
 ; CHECK-LABEL: add_S:
-; CHECK: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s
+; CHECK: faddp v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT: faddp s0, v0.2s
 ; CHECK-NEXT: ret
-  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(<4 x float> undef, <4 x float> %bin.rdx)
+  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %bin.rdx)
   ret float %r
 }
 
@@ -63,39 +53,33 @@
 ; CHECK-LABEL: add_D:
 ; CHECK: faddp d0, v0.2d
 ; CHECK-NEXT: ret
-  %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(<2 x double> undef, <2 x double> %bin.rdx)
+  %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double undef, <2 x double> %bin.rdx)
   ret double %r
 }
 
 define half @add_2H(<16 x half> %bin.rdx) {
 ; CHECK-LABEL: add_2H:
 ; CHECK: fadd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: mov h1, v0.h[1]
-; CHECK-NEXT: mov h2, v0.h[2]
-; CHECK-NEXT: fadd h1, h0, h1
-; CHECK-NEXT: fadd h1, h1, h2
-; CHECK-NEXT: mov h0, v0.h[3]
-; CHECK-NEXT: fadd h0, h1, h0
+; CHECK-NEXT: faddp v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: faddp v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: faddp h0, v0.2h
 ; CHECK-NEXT: ret
 ; CHECKNOFP16-LABEL: add_2H:
 ; CHECKNOFP16-NOT: faddp
 ; CHECKNOFP16-NOT: fadd h{{[0-9]+}}
 ; CHECKNOFP16-NOT: fadd v{{[0-9]+}}.{{[0-9]}}h
 ; CHECKNOFP16: ret
-  %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v16f16(<16 x half> undef, <16 x half> %bin.rdx)
+  %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half undef, <16 x half> %bin.rdx)
   ret half %r
 }
 
 define float @add_2S(<8 x float> %bin.rdx) {
 ; CHECK-LABEL: add_2S:
-; CHECK: fadd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s
+; CHECK: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT: faddp s0, v0.2s
 ; CHECK-NEXT: ret
-  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(<8 x float> undef, <8 x float> %bin.rdx)
+  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float undef, <8 x float> %bin.rdx)
   ret float %r
 }
 
@@ -104,16 +88,16 @@
 ; CHECK: fadd v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: faddp d0, v0.2d
 ; CHECK-NEXT: ret
-  %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(<4 x double> undef, <4 x double> %bin.rdx)
+  %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double undef, <4 x double> %bin.rdx)
   ret double %r
 }
 
 ; Function Attrs: nounwind readnone
-declare half @llvm.experimental.vector.reduce.fadd.f16.v4f16(<4 x half>, <4 x half>)
-declare half @llvm.experimental.vector.reduce.fadd.f16.v8f16(<8 x half>, <8 x half>)
-declare half @llvm.experimental.vector.reduce.fadd.f16.v16f16(<16 x half>, <16 x half>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(<2 x float>, <2 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(<4 x float>, <4 x float>)
-declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(<8 x float>, <8 x float>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(<2 x double>, <2 x double>)
-declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(<4 x double>, <4 x double>)
+declare half @llvm.experimental.vector.reduce.fadd.f16.v4f16(half, <4 x half>)
+declare half @llvm.experimental.vector.reduce.fadd.f16.v8f16(half, <8 x half>)
+declare half @llvm.experimental.vector.reduce.fadd.f16.v16f16(half, <16 x half>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
+declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
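
A note on the v8f16 pattern above, since it is the least obvious of the five: the
vector form of faddp concatenates the pairwise sums of its two source registers, so
passing $Rn in both operand positions halves the number of distinct partial sums at
each step. The lane-by-lane trace below is illustrative only and not part of the
patch; the register numbers are arbitrary, and v1 stands in for the IMPLICIT_DEF
operand. For an input v0 = [r0, r1, r2, r3, r4, r5, r6, r7]:

  faddp v0.8h, v0.8h, v0.8h  // v0 = [r0+r1, r2+r3, r4+r5, r6+r7,
                             //       r0+r1, r2+r3, r4+r5, r6+r7]
  faddp v0.8h, v0.8h, v1.8h  // v0.h[0] = (r0+r1)+(r2+r3), v0.h[1] = (r4+r5)+(r6+r7);
                             // lanes 2-3 repeat those sums, lanes 4-7 come from the
                             // IMPLICIT_DEF operand and are undef
  faddp h0, v0.2h            // h0 = sum of all eight input lanes

The EXTRACT_SUBREG/dsub in the pattern keeps only the low four lanes, and the final
scalar faddp reads just lanes 0 and 1, so the undef lanes introduced by IMPLICIT_DEF
never reach the result. The expansion reassociates the additions, which is why the
tests all apply the fast flag to the reduction call.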