Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4130,6 +4130,18 @@ case Intrinsic::aarch64_sve_frecpx: return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_frecpe_x: + return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_frecps_x: + return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_sve_frsqrte_x: + return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_frsqrts_x: + return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_sve_fabs: return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); @@ -8235,10 +8247,13 @@ SDValue Operand, SelectionDAG &DAG, int &ExtraSteps) { EVT VT = Operand.getValueType(); - if (ST->hasNEON() && - (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || - VT == MVT::f32 || VT == MVT::v1f32 || - VT == MVT::v2f32 || VT == MVT::v4f32)) { + if ((ST->hasNEON() && + (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || + VT == MVT::f32 || VT == MVT::v1f32 || + VT == MVT::v2f32 || VT == MVT::v4f32)) || + (ST->hasSVE() && + (VT == MVT::nxv2f16 || VT == MVT::nxv4f16 || VT == MVT::nxv8f16 || + VT == MVT::nxv2f32 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) { if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) // For the reciprocal estimates, convergence is quadratic, so the number // of digits is doubled after each iteration. In ARMv8, the accuracy of Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -402,8 +402,8 @@ defm SMIN_ZPZZ : sve_int_bin_pred_bhsd; defm UMIN_ZPZZ : sve_int_bin_pred_bhsd; - defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", int_aarch64_sve_frecpe_x>; - defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", int_aarch64_sve_frsqrte_x>; + defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", AArch64frecpe>; + defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", AArch64frsqrte>; defm FADD_ZPmI : sve_fp_2op_i_p_zds<0b000, "fadd", "FADD_ZPZI", sve_fpimm_half_one, fpimm_half, fpimm_one, int_aarch64_sve_fadd>; defm FSUB_ZPmI : sve_fp_2op_i_p_zds<0b001, "fsub", "FSUB_ZPZI", sve_fpimm_half_one, fpimm_half, fpimm_one, int_aarch64_sve_fsub>; @@ -484,8 +484,8 @@ } // End HasSVE let Predicates = [HasSVEorStreamingSVE] in { - defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", int_aarch64_sve_frecps_x>; - defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", int_aarch64_sve_frsqrts_x>; + defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", AArch64frecps>; + defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", AArch64frsqrts>; } // End HasSVEorStreamingSVE let Predicates = [HasSVE] in { Index: llvm/lib/Target/AArch64/SVEInstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/SVEInstrFormats.td +++ llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -1932,7 +1932,10 @@ def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>; def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _H)>; def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; def : SVE_2_Op_Pat(NAME # _S)>; def : SVE_2_Op_Pat(NAME # _D)>; @@ -2631,7 +2634,10 @@ def _S : sve_fp_2op_u_zd<0b10, opc, asm, ZPR32>; def _D : sve_fp_2op_u_zd<0b11, opc, asm, ZPR64>; + def : SVE_1_Op_Pat(NAME # _H)>; + def : SVE_1_Op_Pat(NAME # _H)>; def : SVE_1_Op_Pat(NAME # _H)>; + def : SVE_1_Op_Pat(NAME # _S)>; def : SVE_1_Op_Pat(NAME # _S)>; def : SVE_1_Op_Pat(NAME # _D)>; } Index: llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll @@ -0,0 +1,347 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; FDIV + +define @fdiv_2f16( %a, %b) { +; CHECK-LABEL: fdiv_2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %fdiv = fdiv fast %a, %b + ret %fdiv +} + +define @fdiv_recip_2f16( %a, %b) #0 { +; CHECK-LABEL: fdiv_recip_2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: frecpe z2.h, z1.h +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: frecps z3.h, z1.h, z2.h +; CHECK-NEXT: fmul z2.h, p0/m, z2.h, z3.h +; CHECK-NEXT: frecps z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %fdiv = fdiv fast %a, %b + ret %fdiv +} + +define @fdiv_4f16( %a, %b) { +; CHECK-LABEL: fdiv_4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %fdiv = fdiv fast %a, %b + ret %fdiv +} + +define @fdiv_recip_4f16( %a, %b) #0 { +; CHECK-LABEL: fdiv_recip_4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: frecpe z2.h, z1.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: frecps z3.h, z1.h, z2.h +; CHECK-NEXT: fmul z2.h, p0/m, z2.h, z3.h +; CHECK-NEXT: frecps z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %fdiv = fdiv fast %a, %b + ret %fdiv +} + +define @fdiv_8f16( %a, %b) { +; CHECK-LABEL: fdiv_8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %fdiv = fdiv fast %a, %b + ret %fdiv +} + +define @fdiv_recip_8f16( %a, %b) #0 { +; CHECK-LABEL: fdiv_recip_8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: frecpe z2.h, z1.h +; CHECK-NEXT: frecps z3.h, z1.h, z2.h +; CHECK-NEXT: fmul z2.h, z2.h, z3.h +; CHECK-NEXT: frecps z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z2.h, z1.h +; CHECK-NEXT: fmul z0.h, z1.h, z0.h +; CHECK-NEXT: ret + %fdiv = fdiv fast %a, %b + ret %fdiv +} + +define @fdiv_2f32( %a, %b) { +; CHECK-LABEL: fdiv_2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %fdiv = fdiv fast %a, %b + ret %fdiv +} + +define @fdiv_recip_2f32( %a, %b) #0 { +; CHECK-LABEL: fdiv_recip_2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: frecpe z2.s, z1.s +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: frecps z3.s, z1.s, z2.s +; CHECK-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: frecps z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %fdiv = fdiv fast %a, %b + ret %fdiv +} + +define @fdiv_4f32( %a, %b) { +; CHECK-LABEL: fdiv_4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %fdiv = fdiv fast %a, %b + ret %fdiv +} + +define @fdiv_recip_4f32( %a, %b) #0 { +; CHECK-LABEL: fdiv_recip_4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: frecpe z2.s, z1.s +; CHECK-NEXT: frecps z3.s, z1.s, z2.s +; CHECK-NEXT: fmul z2.s, z2.s, z3.s +; CHECK-NEXT: frecps z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, z2.s, z1.s +; CHECK-NEXT: fmul z0.s, z1.s, z0.s +; CHECK-NEXT: ret + %fdiv = fdiv fast %a, %b + ret %fdiv +} + +define @fdiv_2f64( %a, %b) { +; CHECK-LABEL: fdiv_2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fdiv z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %fdiv = fdiv fast %a, %b + ret %fdiv +} + +define @fdiv_recip_2f64( %a, %b) #0 { +; CHECK-LABEL: fdiv_recip_2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frecpe z2.d, z1.d +; CHECK-NEXT: frecps z3.d, z1.d, z2.d +; CHECK-NEXT: fmul z2.d, z2.d, z3.d +; CHECK-NEXT: frecps z3.d, z1.d, z2.d +; CHECK-NEXT: fmul z2.d, z2.d, z3.d +; CHECK-NEXT: frecps z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.d, z2.d, z1.d +; CHECK-NEXT: fmul z0.d, z1.d, z0.d +; CHECK-NEXT: ret + %fdiv = fdiv fast %a, %b + ret %fdiv +} + +; FSQRT + +define @fsqrt_2f16( %a, %b) { +; CHECK-LABEL: fsqrt_2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h +; CHECK-NEXT: ret + %fsqrt = call fast @llvm.sqrt.nxv2f16( %a) + ret %fsqrt +} + +define @fsqrt_recip_2f16( %a, %b) #0 { +; CHECK-LABEL: fsqrt_recip_2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: frsqrte z1.h, z0.h +; CHECK-NEXT: movprfx z2, z1 +; CHECK-NEXT: fmul z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: movprfx z2, z1 +; CHECK-NEXT: fmul z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z0.h +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: ret + %fsqrt = call fast @llvm.sqrt.nxv2f16( %a) + ret %fsqrt +} + +define @fsqrt_4f16( %a, %b) { +; CHECK-LABEL: fsqrt_4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h +; CHECK-NEXT: ret + %fsqrt = call fast @llvm.sqrt.nxv4f16( %a) + ret %fsqrt +} + +define @fsqrt_recip_4f16( %a, %b) #0 { +; CHECK-LABEL: fsqrt_recip_4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: frsqrte z1.h, z0.h +; CHECK-NEXT: movprfx z2, z1 +; CHECK-NEXT: fmul z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: movprfx z2, z1 +; CHECK-NEXT: fmul z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, p0/m, z1.h, z0.h +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: ret + %fsqrt = call fast @llvm.sqrt.nxv4f16( %a) + ret %fsqrt +} + +define @fsqrt_8f16( %a, %b) { +; CHECK-LABEL: fsqrt_8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h +; CHECK-NEXT: ret + %fsqrt = call fast @llvm.sqrt.nxv8f16( %a) + ret %fsqrt +} + +define @fsqrt_recip_8f16( %a, %b) #0 { +; CHECK-LABEL: fsqrt_recip_8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: frsqrte z1.h, z0.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: fmul z2.h, z1.h, z1.h +; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z2.h, z1.h, z1.h +; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h +; CHECK-NEXT: fmul z1.h, z1.h, z2.h +; CHECK-NEXT: fmul z1.h, z0.h, z1.h +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: ret + %fsqrt = call fast @llvm.sqrt.nxv8f16( %a) + ret %fsqrt +} + +define @fsqrt_2f32( %a, %b) { +; CHECK-LABEL: fsqrt_2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %fsqrt = call fast @llvm.sqrt.nxv2f32( %a) + ret %fsqrt +} + +define @fsqrt_recip_2f32( %a, %b) #0 { +; CHECK-LABEL: fsqrt_recip_2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: frsqrte z1.s, z0.s +; CHECK-NEXT: movprfx z2, z1 +; CHECK-NEXT: fmul z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: frsqrts z2.s, z0.s, z2.s +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: movprfx z2, z1 +; CHECK-NEXT: fmul z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: frsqrts z2.s, z0.s, z2.s +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: ret + %fsqrt = call fast @llvm.sqrt.nxv2f32( %a) + ret %fsqrt +} + +define @fsqrt_4f32( %a, %b) { +; CHECK-LABEL: fsqrt_4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s +; CHECK-NEXT: ret + %fsqrt = call fast @llvm.sqrt.nxv4f32( %a) + ret %fsqrt +} + +define @fsqrt_recip_4f32( %a, %b) #0 { +; CHECK-LABEL: fsqrt_recip_4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: frsqrte z1.s, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fmul z2.s, z1.s, z1.s +; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: frsqrts z2.s, z0.s, z2.s +; CHECK-NEXT: fmul z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z2.s, z1.s, z1.s +; CHECK-NEXT: frsqrts z2.s, z0.s, z2.s +; CHECK-NEXT: fmul z1.s, z1.s, z2.s +; CHECK-NEXT: fmul z1.s, z0.s, z1.s +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: ret + %fsqrt = call fast @llvm.sqrt.nxv4f32( %a) + ret %fsqrt +} + +define @fsqrt_2f64( %a, %b) { +; CHECK-LABEL: fsqrt_2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d +; CHECK-NEXT: ret + %fsqrt = call fast @llvm.sqrt.nxv2f64( %a) + ret %fsqrt +} + +define @fsqrt_recip_2f64( %a, %b) #0 { +; CHECK-LABEL: fsqrt_recip_2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: frsqrte z1.d, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fmul z2.d, z1.d, z1.d +; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: frsqrts z2.d, z0.d, z2.d +; CHECK-NEXT: fmul z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z2.d, z1.d, z1.d +; CHECK-NEXT: frsqrts z2.d, z0.d, z2.d +; CHECK-NEXT: fmul z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z2.d, z1.d, z1.d +; CHECK-NEXT: frsqrts z2.d, z0.d, z2.d +; CHECK-NEXT: fmul z1.d, z1.d, z2.d +; CHECK-NEXT: fmul z1.d, z0.d, z1.d +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: ret + %fsqrt = call fast @llvm.sqrt.nxv2f64( %a) + ret %fsqrt +} + +declare @llvm.sqrt.nxv2f16() +declare @llvm.sqrt.nxv4f16() +declare @llvm.sqrt.nxv8f16() +declare @llvm.sqrt.nxv2f32() +declare @llvm.sqrt.nxv4f32() +declare @llvm.sqrt.nxv2f64() + +attributes #0 = { "reciprocal-estimates"="all" }