Index: include/llvm/Target/TargetSelectionDAG.td =================================================================== --- include/llvm/Target/TargetSelectionDAG.td +++ include/llvm/Target/TargetSelectionDAG.td @@ -280,6 +280,10 @@ SDTCisVT<2, OtherVT>, SDTCisVT<3, OtherVT>, SDTCisPtrTy<4>, SDTCisPtrTy<5> ]>; +def SDT_FPVecReduce : SDTypeProfile<1, 1, [ + SDTCisFP<0>, SDTCisVec<1> +]>; + class SDCallSeqStart<list<SDTypeConstraint> constraints> : SDTypeProfile<0, 2, constraints>; class SDCallSeqEnd<list<SDTypeConstraint> constraints> : @@ -565,6 +569,9 @@ def extract_subvector : SDNode<"ISD::EXTRACT_SUBVECTOR", SDTSubVecExtract, []>; def insert_subvector : SDNode<"ISD::INSERT_SUBVECTOR", SDTSubVecInsert, []>; +// Operators for vecreduce nodes. +def vec_reduce_fadd : SDNode<"ISD::VECREDUCE_FADD", SDT_FPVecReduce>; + // Nodes for intrinsics, you should use the intrinsic itself and let tblgen use // these internally. Don't reference these directly. def intrinsic_void : SDNode<"ISD::INTRINSIC_VOID", Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -710,6 +710,9 @@ MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); + + if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) + setOperationAction(ISD::VECREDUCE_FADD, VT, Legal); } setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); Index: lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.td +++ lib/Target/AArch64/AArch64InstrInfo.td @@ -4354,6 +4354,23 @@ defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">; defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">; defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">; + +let Predicates = [HasFullFP16] in { +def : Pat<(f16 (vec_reduce_fadd 
(v8f16 V128:$Rn))), + (FADDPv2i16p + (EXTRACT_SUBREG + (FADDPv8f16 (FADDPv8f16 V128:$Rn, V128:$Rn), (v8f16 (IMPLICIT_DEF))), + dsub))>; +def : Pat<(f16 (vec_reduce_fadd (v4f16 V64:$Rn))), + (FADDPv2i16p (FADDPv4f16 V64:$Rn, V64:$Rn))>; +} +def : Pat<(f32 (vec_reduce_fadd (v4f32 V128:$Rn))), + (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>; +def : Pat<(f32 (vec_reduce_fadd (v2f32 V64:$Rn))), + (FADDPv2i32p V64:$Rn)>; +def : Pat<(f64 (vec_reduce_fadd (v2f64 V128:$Rn))), + (FADDPv2i64p V128:$Rn)>; + def : Pat<(v2i64 (AArch64saddv V128:$Rn)), (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>; def : Pat<(v2i64 (AArch64uaddv V128:$Rn)), Index: test/CodeGen/AArch64/vecreduce-fadd-legalization.ll =================================================================== --- test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +++ test/CodeGen/AArch64/vecreduce-fadd-legalization.ll @@ -49,9 +49,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s1, wzr ; CHECK-NEXT: mov v0.s[3], v1.s[0] -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s -; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a) ret float %b @@ -74,9 +73,8 @@ ; CHECK-NEXT: fadd v1.4s, v1.4s, v3.4s ; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s ; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s -; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a) ret float %b Index: test/CodeGen/AArch64/vecreduce-fadd.ll =================================================================== --- test/CodeGen/AArch64/vecreduce-fadd.ll +++ test/CodeGen/AArch64/vecreduce-fadd.ll 
@@ -11,12 +11,8 @@ define half @add_HalfH(<4 x half> %bin.rdx) { ; CHECK-LABEL: add_HalfH: -; CHECK: mov h3, v0.h[1] -; CHECK-NEXT: mov h1, v0.h[3] -; CHECK-NEXT: mov h2, v0.h[2] -; CHECK-NEXT: fadd h0, h0, h3 -; CHECK-NEXT: fadd h0, h0, h2 -; CHECK-NEXT: fadd h0, h0, h1 +; CHECK: faddp v0.4h, v0.4h, v0.4h +; CHECK-NEXT: faddp h0, v0.2h ; CHECK-NEXT: ret ; CHECKNOFP16-LABEL: add_HalfH: ; CHECKNOFP16-NOT: faddp @@ -30,14 +26,9 @@ define half @add_H(<8 x half> %bin.rdx) { ; CHECK-LABEL: add_H: -; CHECK: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h -; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: mov h2, v0.h[2] -; CHECK-NEXT: fadd h1, h0, h1 -; CHECK-NEXT: fadd h1, h1, h2 -; CHECK-NEXT: mov h0, v0.h[3] -; CHECK-NEXT: fadd h0, h1, h0 +; CHECK: faddp v0.8h, v0.8h, v0.8h +; CHECK-NEXT: faddp v0.8h, v0.8h, v0.8h +; CHECK-NEXT: faddp h0, v0.2h ; CHECK-NEXT: ret ; CHECKNOFP16-LABEL: add_H: @@ -51,8 +42,7 @@ define float @add_S(<4 x float> %bin.rdx) { ; CHECK-LABEL: add_S: -; CHECK: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s +; CHECK: faddp v0.4s, v0.4s, v0.4s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(<4 x float> undef, <4 x float> %bin.rdx) @@ -70,14 +60,9 @@ define half @add_2H(<16 x half> %bin.rdx) { ; CHECK-LABEL: add_2H: ; CHECK: fadd v0.8h, v0.8h, v1.8h -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h -; CHECK-NEXT: mov h1, v0.h[1] -; CHECK-NEXT: mov h2, v0.h[2] -; CHECK-NEXT: fadd h1, h0, h1 -; CHECK-NEXT: fadd h1, h1, h2 -; CHECK-NEXT: mov h0, v0.h[3] -; CHECK-NEXT: fadd h0, h1, h0 +; CHECK-NEXT: faddp v0.8h, v0.8h, v0.8h +; CHECK-NEXT: faddp v0.8h, v0.8h, v0.8h +; CHECK-NEXT: faddp h0, v0.2h ; CHECK-NEXT: ret ; CHECKNOFP16-LABEL: add_2H: ; CHECKNOFP16-NOT: faddp @@ -90,9 +75,8 @@ define float @add_2S(<8 x float> %bin.rdx) { ; CHECK-LABEL: add_2S: -; CHECK: fadd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ext 
v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s +; CHECK: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s ; CHECK-NEXT: faddp s0, v0.2s ; CHECK-NEXT: ret %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(<8 x float> undef, <8 x float> %bin.rdx)