Index: lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.h +++ lib/Target/AArch64/AArch64ISelLowering.h @@ -145,6 +145,7 @@ // Vector across-lanes addition // Only the lower result lane is defined. + FADDV, SADDV, UADDV, Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -710,6 +710,9 @@ MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); + + if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) + setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); } setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); @@ -1175,6 +1178,7 @@ case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; + case AArch64ISD::FADDV: return "AArch64ISD::FADDV"; case AArch64ISD::SADDV: return "AArch64ISD::SADDV"; case AArch64ISD::UADDV: return "AArch64ISD::UADDV"; case AArch64ISD::SMINV: return "AArch64ISD::SMINV"; @@ -2965,6 +2969,7 @@ case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_FADD: case ISD::VECREDUCE_SMAX: case ISD::VECREDUCE_SMIN: case ISD::VECREDUCE_UMAX: @@ -7858,6 +7863,8 @@ SelectionDAG &DAG) const { SDLoc dl(Op); switch (Op.getOpcode()) { + case ISD::VECREDUCE_FADD: + return getReductionSDNode(AArch64ISD::FADDV, dl, Op, DAG); case ISD::VECREDUCE_ADD: return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); case ISD::VECREDUCE_SMAX: @@ -11605,6 +11612,7 @@ ReplaceBITCASTResults(N, Results, DAG); return; case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_FADD: case ISD::VECREDUCE_SMAX: case ISD::VECREDUCE_SMIN: case ISD::VECREDUCE_UMAX: Index: lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.td +++ lib/Target/AArch64/AArch64InstrInfo.td @@ -391,6 +391,7 @@ def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>; def AArch64frsqrts : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>; +def AArch64faddv : SDNode<"AArch64ISD::FADDV", SDT_AArch64UnaryVec>; def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>; def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>; def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>; @@ -4354,6 +4355,30 @@ defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">; defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">; defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">; + +let Predicates = [HasFullFP16] in { +def : Pat<(v8f16 (AArch64faddv V128:$Rn)), + (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), + (FADDPv2i16p + (EXTRACT_SUBREG + (FADDPv8f16 (FADDPv8f16 V128:$Rn, V128:$Rn), (v8f16 (IMPLICIT_DEF))), + dsub)), + hsub)>; +def : Pat<(v4f16 (AArch64faddv V64:$Rn)), + (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), + (FADDPv2i16p (FADDPv4f16 V64:$Rn, V64:$Rn)), + hsub)>; +} +def : Pat<(v4f32 (AArch64faddv V128:$Rn)), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), + (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub)), + ssub)>; +def : Pat<(v2f32 (AArch64faddv V64:$Rn)), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), + (FADDPv2i32p V64:$Rn), ssub)>; +def : Pat<(v2f64 (AArch64faddv V128:$Rn)), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), + (FADDPv2i64p V128:$Rn), dsub)>; def : Pat<(v2i64 (AArch64saddv V128:$Rn)), (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>; def : Pat<(v2i64 (AArch64uaddv V128:$Rn)), Index: test/CodeGen/AArch64/aarch64-faddv.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/aarch64-faddv.ll @@ -0,0 +1,93 @@ +; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic -asm-verbose=0 -mattr=+fullfp16 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic -asm-verbose=0 | FileCheck %s --check-prefix=CHECKNOFP16 + +define float @add_HalfS(<2 x float> %bin.rdx) { +; CHECK-LABEL: add_HalfS: +; CHECK: faddp s0, v0.2s +; CHECK-NEXT: ret + %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v2f32(<2 x float> undef, <2 x float> %bin.rdx) + ret float %r +} + +define half @add_HalfH(<4 x half> %bin.rdx) { +; CHECK-LABEL: add_HalfH: +; CHECK: faddp v0.4h, v0.4h, v0.4h +; CHECK-NEXT: faddp h0, v0.2h +; CHECK-NEXT: ret +; CHECKNOFP16-LABEL: add_HalfH: +; CHECKNOFP16-NOT: faddp +; CHECKNOFP16: ret + %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v4f16(<4 x half> undef, <4 x half> %bin.rdx) + ret half %r +} + + +define half @add_H(<8 x half> %bin.rdx) { +; CHECK-LABEL: add_H: +; CHECK: faddp v0.8h, v0.8h, v0.8h +; CHECK-NEXT: faddp v0.8h, v0.8h, v0.8h +; CHECK-NEXT: faddp h0, v0.2h +; CHECK-NEXT: ret +; CHECKNOFP16-LABEL: add_H: +; CHECKNOFP16-NOT: faddp +; CHECKNOFP16: ret + %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v8f16(<8 x half> undef, <8 x half> %bin.rdx) + ret half %r +} + +define float @add_S(<4 x float> %bin.rdx) { +; CHECK-LABEL: add_S: +; CHECK: faddp v0.4s, v0.4s, v0.4s +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret + %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(<4 x float> undef, <4 x float> %bin.rdx) + ret float %r +} + +define double @add_D(<2 x double> %bin.rdx) { +; CHECK-LABEL: add_D: +; CHECK: faddp d0, v0.2d +; CHECK-NEXT: ret + %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v2f64(<2 x double> undef, <2 x double> %bin.rdx) + ret double %r +} + +define half @add_2H(<16 x half> %bin.rdx) { +; CHECK-LABEL: add_2H: +; CHECK: fadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: faddp v0.8h, v0.8h, v0.8h +; CHECK-NEXT: faddp v0.8h, v0.8h, v0.8h +; CHECK-NEXT: faddp h0, v0.2h +; CHECK-NEXT: ret + %r = call fast half @llvm.experimental.vector.reduce.fadd.f16.v16f16(<16 x half> undef, <16 x half> %bin.rdx) + ret half %r +} + +define float @add_2S(<8 x float> %bin.rdx) { +; CHECK-LABEL: add_2S: +; CHECK: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret + %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v8f32(<8 x float> undef, <8 x float> %bin.rdx) + ret float %r +} + +define double @add_2D(<4 x double> %bin.rdx) { +; CHECK-LABEL: add_2D: +; CHECK: fadd v0.2d, v0.2d, v1.2d +; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: ret + %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.v4f64(<4 x double> undef, <4 x double> %bin.rdx) + ret double %r +} + +; Function Attrs: nounwind readnone +declare half @llvm.experimental.vector.reduce.fadd.f16.v4f16(<4 x half>, <4 x half>) +declare half @llvm.experimental.vector.reduce.fadd.f16.v8f16(<8 x half>, <8 x half>) +declare half @llvm.experimental.vector.reduce.fadd.f16.v16f16(<16 x half>, <16 x half>) +declare float @llvm.experimental.vector.reduce.fadd.f32.v2f32(<2 x float>, <2 x float>) +declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(<4 x float>, <4 x float>) +declare float @llvm.experimental.vector.reduce.fadd.f32.v8f32(<8 x float>, <8 x float>) +declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(<2 x double>, <2 x double>) +declare double @llvm.experimental.vector.reduce.fadd.f64.v4f64(<4 x double>, <4 x double>) Index: test/CodeGen/AArch64/vecreduce-fadd-legalization.ll =================================================================== --- test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +++ test/CodeGen/AArch64/vecreduce-fadd-legalization.ll @@ -49,9 +49,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s1, wzr ; CHECK-NEXT: mov v0.s[3], v1.s[0] -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s -; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v3f32(float 0.0, <3 x float> %a) ret float %b @@ -74,9 +74,9 @@ ; CHECK-NEXT: fadd v1.4s, v1.4s, v3.4s ; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s ; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s -; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret %b = call fast nnan float @llvm.experimental.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a) ret float %b