Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8142,10 +8142,12 @@
                                          SDValue Operand, SelectionDAG &DAG,
                                          int &ExtraSteps) {
   EVT VT = Operand.getValueType();
-  if (ST->hasNEON() &&
-      (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
-       VT == MVT::f32 || VT == MVT::v1f32 ||
-       VT == MVT::v2f32 || VT == MVT::v4f32)) {
+  if ((ST->hasNEON() &&
+       (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
+        VT == MVT::f32 || VT == MVT::v1f32 ||
+        VT == MVT::v2f32 || VT == MVT::v4f32)) ||
+      (ST->hasSVE() &&
+       (VT == MVT::nxv2f32 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
     if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
       // For the reciprocal estimates, convergence is quadratic, so the number
       // of digits is doubled after each iteration. In ARMv8, the accuracy of
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1896,6 +1896,32 @@
   def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
             (ADDVL_XXI GPR64:$op, $imm)>;
 
+  def : Pat<(nxv2f32 (AArch64frecps (nxv2f32 ZPR:$Zs1), (nxv2f32 ZPR:$Zs2))),
+            (FRECPS_ZZZ_S ZPR:$Zs1, ZPR:$Zs2)>;
+  def : Pat<(nxv4f32 (AArch64frecps (nxv4f32 ZPR:$Zs1), (nxv4f32 ZPR:$Zs2))),
+            (FRECPS_ZZZ_S ZPR:$Zs1, ZPR:$Zs2)>;
+  def : Pat<(nxv2f64 (AArch64frecps (nxv2f64 ZPR:$Zs1), (nxv2f64 ZPR:$Zs2))),
+            (FRECPS_ZZZ_D ZPR:$Zs1, ZPR:$Zs2)>;
+  def : Pat<(nxv2f32 (AArch64frsqrts (nxv2f32 ZPR:$Zs1), (nxv2f32 ZPR:$Zs2))),
+            (FRSQRTS_ZZZ_S ZPR:$Zs1, ZPR:$Zs2)>;
+  def : Pat<(nxv4f32 (AArch64frsqrts (nxv4f32 ZPR:$Zs1), (nxv4f32 ZPR:$Zs2))),
+            (FRSQRTS_ZZZ_S ZPR:$Zs1, ZPR:$Zs2)>;
+  def : Pat<(nxv2f64 (AArch64frsqrts (nxv2f64 ZPR:$Zs1), (nxv2f64 ZPR:$Zs2))),
+            (FRSQRTS_ZZZ_D ZPR:$Zs1, ZPR:$Zs2)>;
+
+  def : Pat<(nxv2f32 (AArch64frecpe (nxv2f32 ZPR:$Zs))),
+            (FRECPE_ZZ_S ZPR:$Zs)>;
+  def : Pat<(nxv4f32 (AArch64frecpe (nxv4f32 ZPR:$Zs))),
+            (FRECPE_ZZ_S ZPR:$Zs)>;
+  def : Pat<(nxv2f64 (AArch64frecpe (nxv2f64 ZPR:$Zs))),
+            (FRECPE_ZZ_D ZPR:$Zs)>;
+  def : Pat<(nxv2f32 (AArch64frsqrte (nxv2f32 ZPR:$Zs))),
+            (FRSQRTE_ZZ_S ZPR:$Zs)>;
+  def : Pat<(nxv4f32 (AArch64frsqrte (nxv4f32 ZPR:$Zs))),
+            (FRSQRTE_ZZ_S ZPR:$Zs)>;
+  def : Pat<(nxv2f64 (AArch64frsqrte (nxv2f64 ZPR:$Zs))),
+            (FRSQRTE_ZZ_D ZPR:$Zs)>;
+
   // FIXME: BigEndian requires an additional REV instruction to satisfy the
   // constraint that none of the bits change when stored to memory as one
   // type, and and reloaded as another type.
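
Note on the changes above: the SVE types are routed through the same estimate-plus-refinement path as the NEON ones, and the refinement the new FRECPS/FRSQRTS patterns feed into is ordinary Newton-Raphson. A minimal scalar sketch of the math, assuming the architectural step semantics FRECPS(a, b) = 2 - a*b and FRSQRTS(a, b) = (3 - a*b)/2; this is only a model, not the code the backend emits:

// Scalar model of the expansions the new patterns enable. Assumes the Arm
// step semantics: FRECPS(a, b) = 2 - a*b, FRSQRTS(a, b) = (3 - a*b)/2.
static double frecps(double a, double b) { return 2.0 - a * b; }
static double frsqrts(double a, double b) { return (3.0 - a * b) / 2.0; }

// Refine an initial estimate x0 of 1/d: each step is x <- x * (2 - d*x).
double refineRecip(double d, double x0, int Steps) {
  double x = x0;
  for (int i = 0; i < Steps; ++i)
    x = x * frecps(d, x);
  return x;
}

// Refine an initial estimate x0 of 1/sqrt(d): x <- x * (3 - d*x*x)/2.
double refineRSqrt(double d, double x0, int Steps) {
  double x = x0;
  for (int i = 0; i < Steps; ++i)
    x = x * frsqrts(d, x * x);
  return x;
}

Each step roughly doubles the number of correct bits, so with an initial estimate of about 8 bits, two steps cover an f32 mantissa and three cover an f64 mantissa, which matches the step counts visible in the test expansions below.
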
Index: llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll
@@ -0,0 +1,179 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+; FDIV
+
+define <vscale x 2 x float> @fdiv_2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
+; CHECK-LABEL: fdiv_2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %fdiv = fdiv fast <vscale x 2 x float> %a, %b
+  ret <vscale x 2 x float> %fdiv
+}
+
+define <vscale x 2 x float> @fdiv_recip_2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) #0 {
+; CHECK-LABEL: fdiv_recip_2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frecpe z2.s, z1.s
+; CHECK-NEXT:    frecps z3.s, z1.s, z2.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fmul z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    frecps z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %fdiv = fdiv fast <vscale x 2 x float> %a, %b
+  ret <vscale x 2 x float> %fdiv
+}
+
+define <vscale x 4 x float> @fdiv_4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fdiv_4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %fdiv = fdiv fast <vscale x 4 x float> %a, %b
+  ret <vscale x 4 x float> %fdiv
+}
+
+define <vscale x 4 x float> @fdiv_recip_4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: fdiv_recip_4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frecpe z2.s, z1.s
+; CHECK-NEXT:    frecps z3.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z2.s, z2.s, z3.s
+; CHECK-NEXT:    frecps z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z2.s, z1.s
+; CHECK-NEXT:    fmul z0.s, z1.s, z0.s
+; CHECK-NEXT:    ret
+  %fdiv = fdiv fast <vscale x 4 x float> %a, %b
+  ret <vscale x 4 x float> %fdiv
+}
+
+define <vscale x 2 x double> @fdiv_f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fdiv_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %fdiv = fdiv fast <vscale x 2 x double> %a, %b
+  ret <vscale x 2 x double> %fdiv
+}
+
+define <vscale x 2 x double> @fdiv_recip_f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: fdiv_recip_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frecpe z2.d, z1.d
+; CHECK-NEXT:    frecps z3.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z2.d, z2.d, z3.d
+; CHECK-NEXT:    frecps z3.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z2.d, z2.d, z3.d
+; CHECK-NEXT:    frecps z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.d, z2.d, z1.d
+; CHECK-NEXT:    fmul z0.d, z1.d, z0.d
+; CHECK-NEXT:    ret
+  %fdiv = fdiv fast <vscale x 2 x double> %a, %b
+  ret <vscale x 2 x double> %fdiv
+}
+
+; FSQRT
+
+define <vscale x 2 x float> @fsqrt_2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
+; CHECK-LABEL: fsqrt_2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %fsqrt = call fast <vscale x 2 x float> @llvm.sqrt.nxv2f32(<vscale x 2 x float> %a)
+  ret <vscale x 2 x float> %fsqrt
+}
+
+define <vscale x 2 x float> @fsqrt_recip_2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) #0 {
+; CHECK-LABEL: fsqrt_recip_2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frsqrte z1.s, z0.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    movprfx z2, z1
+; CHECK-NEXT:    fmul z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    frsqrts z2.s, z0.s, z2.s
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    movprfx z2, z1
+; CHECK-NEXT:    fmul z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT:    frsqrts z2.s, z0.s, z2.s
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, p0/m, z1.s, z0.s
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %fsqrt = call fast <vscale x 2 x float> @llvm.sqrt.nxv2f32(<vscale x 2 x float> %a)
+  ret <vscale x 2 x float> %fsqrt
+}
+
+define <vscale x 4 x float> @fsqrt_4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: fsqrt_4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fsqrt z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %fsqrt = call fast <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> %a)
+  ret <vscale x 4 x float> %fsqrt
+}
+
+define <vscale x 4 x float> @fsqrt_recip_4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
+; CHECK-LABEL: fsqrt_recip_4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frsqrte z1.s, z0.s
+; CHECK-NEXT:    fmul z2.s, z1.s, z1.s
+; CHECK-NEXT:    frsqrts z2.s, z0.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z2.s, z1.s, z1.s
+; CHECK-NEXT:    frsqrts z2.s, z0.s, z2.s
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fmul z1.s, z1.s, z2.s
+; CHECK-NEXT:    fmul z1.s, z0.s, z1.s
+; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %fsqrt = call fast <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> %a)
+  ret <vscale x 4 x float> %fsqrt
+}
+
+define <vscale x 2 x double> @fsqrt_f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: fsqrt_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fsqrt z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %fsqrt = call fast <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> %a)
  ret <vscale x 2 x double> %fsqrt
+}
+
+define <vscale x 2 x double> @fsqrt_recip_f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
+; CHECK-LABEL: fsqrt_recip_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frsqrte z1.d, z0.d
+; CHECK-NEXT:    fmul z2.d, z1.d, z1.d
+; CHECK-NEXT:    frsqrts z2.d, z0.d, z2.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z2.d, z1.d, z1.d
+; CHECK-NEXT:    frsqrts z2.d, z0.d, z2.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z2.d, z1.d, z1.d
+; CHECK-NEXT:    frsqrts z2.d, z0.d, z2.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fmul z1.d, z1.d, z2.d
+; CHECK-NEXT:    fmul z1.d, z0.d, z1.d
+; CHECK-NEXT:    fcmeq p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %fsqrt = call fast <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> %a)
+  ret <vscale x 2 x double> %fsqrt
+}
+
+declare <vscale x 2 x float> @llvm.sqrt.nxv2f32(<vscale x 2 x float>)
+declare <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double>)
+
+attributes #0 = { "reciprocal-estimates"="all" }
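
One detail in the fsqrt_recip_* checks above: sqrt(x) is formed as x * (1/sqrt(x)), so the trailing fcmeq ... #0.0 / sel pair guards the x == 0 case, where the reciprocal-square-root estimate is infinite and the product would otherwise be NaN. A minimal scalar sketch of that guard (an illustration, not the backend's expansion):

// Model of the fcmeq/sel guard at the end of each fsqrt expansion: with
// d == 0 the 1/sqrt(d) estimate is +inf, so d * estimate would be NaN;
// the compare-and-select restores sqrt(0) == 0.
double sqrtViaRSqrt(double d, double rsqrtEstimate) {
  double s = d * rsqrtEstimate; // NaN when d == 0 (0 * inf)
  return d == 0.0 ? d : s;      // corresponds to fcmeq ... #0.0 + sel
}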