Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -88,6 +88,13 @@
   cl::ReallyHidden,
   cl::init(false));
 
+// Disable processing of fdiv so we can better test the backend implementations.
+static cl::opt<bool> DisableFDivExpand(
+  "amdgpu-codegenprepare-disable-fdiv-expansion",
+  cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
+  cl::ReallyHidden,
+  cl::init(false));
+
 class AMDGPUCodeGenPrepareImpl
     : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
 public:
@@ -834,6 +841,9 @@
 //
 // NOTE: rcp is the preference in cases that both are legal.
 bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
+  if (DisableFDivExpand)
+    return false;
+
   Type *Ty = FDiv.getType()->getScalarType();
   if (!Ty->isFloatTy())
     return false;
Index: llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -0,0 +1,881 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; Check for consistency of interpretation of fast math flags on fdiv
+; between implementations.
+ +; RUN: llc -global-isel=0 -amdgpu-codegenprepare-disable-fdiv-expansion=0 -march=amdgcn -mcpu=hawaii -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=CHECK,IEEE,CODEGEN-IEEE-SDAG %s +; RUN: llc -global-isel=1 -amdgpu-codegenprepare-disable-fdiv-expansion=0 -march=amdgcn -mcpu=hawaii -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=CHECK,IEEE,CODEGEN-IEEE-GISEL %s +; RUN: llc -global-isel=0 -amdgpu-codegenprepare-disable-fdiv-expansion=1 -march=amdgcn -mcpu=hawaii -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=CHECK,IEEE,IR-IEEE,IR-IEEE-SDAG %s +; RUN: llc -global-isel=1 -amdgpu-codegenprepare-disable-fdiv-expansion=1 -march=amdgcn -mcpu=hawaii -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=CHECK,IEEE,IR-IEEE,IR-IEEE-GISEL %s + +; RUN: llc -global-isel=0 -amdgpu-codegenprepare-disable-fdiv-expansion=0 -march=amdgcn -mcpu=hawaii -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=CHECK,DAZ,CODEGEN-DAZ,CODEGEN-DAZ-SDAG %s +; RUN: llc -global-isel=1 -amdgpu-codegenprepare-disable-fdiv-expansion=0 -march=amdgcn -mcpu=hawaii -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=CHECK,DAZ,CODEGEN-DAZ,CODEGEN-DAZ-GISEL %s +; RUN: llc -global-isel=0 -amdgpu-codegenprepare-disable-fdiv-expansion=1 -march=amdgcn -mcpu=hawaii -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=CHECK,DAZ,IR-DAZ,IR-DAZ-SDAG %s +; RUN: llc -global-isel=1 -amdgpu-codegenprepare-disable-fdiv-expansion=1 -march=amdgcn -mcpu=hawaii -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=CHECK,DAZ,IR-DAZ,IR-DAZ-GISEL %s + +define float @v_fdiv_f32(float %x, float %y) { +; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_f32: +; CODEGEN-IEEE-SDAG: ; %bb.0: +; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-SDAG-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_rcp_f32_e32 v3, v2 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v3, v4, v3, v3 
+; CODEGEN-IEEE-SDAG-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v5, v4, v3 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v6, -v2, v5, v4 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v5, v6, v3, v5 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v2, -v2, v5, v4 +; CODEGEN-IEEE-SDAG-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; CODEGEN-IEEE-SDAG-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_f32: +; CODEGEN-IEEE-GISEL: ; %bb.0: +; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v3, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v3, v5, v3, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v5, v4, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v6, -v2, v5, v4 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, v6, v3, v5 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v2, -v2, v5, v4 +; CODEGEN-IEEE-GISEL-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; CODEGEN-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-SDAG-LABEL: v_fdiv_f32: +; IR-IEEE-SDAG: ; %bb.0: +; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; IR-IEEE-SDAG-NEXT: v_rcp_f32_e32 v3, v2 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, v4, v3, v3 +; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v5, v4, v3 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v6, -v2, v5, v4 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v5, v6, v3, v5 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v2, -v2, v5, v4 +; IR-IEEE-SDAG-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; IR-IEEE-SDAG-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; IR-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-GISEL-LABEL: 
v_fdiv_f32: +; IR-IEEE-GISEL: ; %bb.0: +; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v3, v2 +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v3, v5, v3, v3 +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v5, v4, v3 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v6, -v2, v5, v4 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, v6, v3, v5 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v2, -v2, v5, v4 +; IR-IEEE-GISEL-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; IR-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAZ-LABEL: v_fdiv_f32: +; DAZ: ; %bb.0: +; DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAZ-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; DAZ-NEXT: v_rcp_f32_e32 v3, v2 +; DAZ-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; DAZ-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; DAZ-NEXT: v_fma_f32 v3, v5, v3, v3 +; DAZ-NEXT: v_mul_f32_e32 v5, v4, v3 +; DAZ-NEXT: v_fma_f32 v6, -v2, v5, v4 +; DAZ-NEXT: v_fma_f32 v5, v6, v3, v5 +; DAZ-NEXT: v_fma_f32 v2, -v2, v5, v4 +; DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; DAZ-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; DAZ-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; DAZ-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv float %x, %y + ret float %fdiv +} + +define float @v_fdiv_f32_afn(float %x, float %y) { +; CHECK-LABEL: v_fdiv_f32_afn: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_rcp_f32_e32 v1, v1 +; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv afn float %x, %y + ret float %fdiv +} + +define float @v_fdiv_f32_arcp(float %x, float %y) { +; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_f32_arcp: +; CODEGEN-IEEE-SDAG: ; %bb.0: +; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
CODEGEN-IEEE-SDAG-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_rcp_f32_e32 v3, v2 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v3, v4, v3, v3 +; CODEGEN-IEEE-SDAG-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v5, v4, v3 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v6, -v2, v5, v4 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v5, v6, v3, v5 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v2, -v2, v5, v4 +; CODEGEN-IEEE-SDAG-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; CODEGEN-IEEE-SDAG-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_f32_arcp: +; CODEGEN-IEEE-GISEL: ; %bb.0: +; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v3, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v3, v5, v3, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v5, v4, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v6, -v2, v5, v4 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, v6, v3, v5 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v2, -v2, v5, v4 +; CODEGEN-IEEE-GISEL-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; CODEGEN-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-SDAG-LABEL: v_fdiv_f32_arcp: +; IR-IEEE-SDAG: ; %bb.0: +; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; IR-IEEE-SDAG-NEXT: v_rcp_f32_e32 v3, v2 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, v4, v3, v3 +; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v5, v4, v3 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v6, -v2, v5, v4 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v5, v6, v3, 
v5 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v2, -v2, v5, v4 +; IR-IEEE-SDAG-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; IR-IEEE-SDAG-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; IR-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-GISEL-LABEL: v_fdiv_f32_arcp: +; IR-IEEE-GISEL: ; %bb.0: +; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v3, v2 +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v3, v5, v3, v3 +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v5, v4, v3 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v6, -v2, v5, v4 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, v6, v3, v5 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v2, -v2, v5, v4 +; IR-IEEE-GISEL-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; IR-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAZ-LABEL: v_fdiv_f32_arcp: +; DAZ: ; %bb.0: +; DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAZ-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; DAZ-NEXT: v_rcp_f32_e32 v3, v2 +; DAZ-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; DAZ-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; DAZ-NEXT: v_fma_f32 v3, v5, v3, v3 +; DAZ-NEXT: v_mul_f32_e32 v5, v4, v3 +; DAZ-NEXT: v_fma_f32 v6, -v2, v5, v4 +; DAZ-NEXT: v_fma_f32 v5, v6, v3, v5 +; DAZ-NEXT: v_fma_f32 v2, -v2, v5, v4 +; DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; DAZ-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; DAZ-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; DAZ-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv arcp float %x, %y + ret float %fdiv +} + +define float @v_fdiv_f32_arcp_afn(float %x, float %y) { +; CHECK-LABEL: v_fdiv_f32_arcp_afn: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_rcp_f32_e32 v1, v1 +; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %fdiv = 
fdiv arcp afn float %x, %y + ret float %fdiv +} + +define float @v_fdiv_recip_f32(float %x) { +; IEEE-LABEL: v_fdiv_recip_f32: +; IEEE: ; %bb.0: +; IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IEEE-NEXT: s_setpc_b64 s[30:31] +; +; DAZ-LABEL: v_fdiv_recip_f32: +; DAZ: ; %bb.0: +; DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAZ-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; DAZ-NEXT: v_rcp_f32_e32 v2, v1 +; DAZ-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; DAZ-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; DAZ-NEXT: v_fma_f32 v2, v4, v2, v2 +; DAZ-NEXT: v_mul_f32_e32 v4, v3, v2 +; DAZ-NEXT: v_fma_f32 v5, -v1, v4, v3 +; DAZ-NEXT: v_fma_f32 v4, v5, v2, v4 +; DAZ-NEXT: v_fma_f32 v1, -v1, v4, v3 +; DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; DAZ-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; DAZ-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; DAZ-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv float 1.0, %x + ret float %fdiv +} + +define float @v_fdiv_recip_f32_afn(float %x) { +; CHECK-LABEL: v_fdiv_recip_f32_afn: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_rcp_f32_e32 v0, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv afn float 1.0, %x + ret float %fdiv +} + +define float @v_fdiv_recip_f32_arcp(float %x) { +; IEEE-LABEL: v_fdiv_recip_f32_arcp: +; IEEE: ; %bb.0: +; IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IEEE-NEXT: 
v_rcp_f32_e32 v2, v1 +; IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IEEE-NEXT: s_setpc_b64 s[30:31] +; +; DAZ-LABEL: v_fdiv_recip_f32_arcp: +; DAZ: ; %bb.0: +; DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAZ-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; DAZ-NEXT: v_rcp_f32_e32 v2, v1 +; DAZ-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; DAZ-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; DAZ-NEXT: v_fma_f32 v2, v4, v2, v2 +; DAZ-NEXT: v_mul_f32_e32 v4, v3, v2 +; DAZ-NEXT: v_fma_f32 v5, -v1, v4, v3 +; DAZ-NEXT: v_fma_f32 v4, v5, v2, v4 +; DAZ-NEXT: v_fma_f32 v1, -v1, v4, v3 +; DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; DAZ-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; DAZ-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; DAZ-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv arcp float 1.0, %x + ret float %fdiv +} + +define float @v_fdiv_recip_f32_arcp_afn(float %x) { +; CHECK-LABEL: v_fdiv_recip_f32_arcp_afn: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_rcp_f32_e32 v0, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv arcp afn float 1.0, %x + ret float %fdiv +} + +define float @v_fdiv_recip_sqrt_f32(float %x) { +; IEEE-LABEL: v_fdiv_recip_sqrt_f32: +; IEEE: ; %bb.0: +; IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; IEEE-NEXT: v_fma_f32 
v5, -v1, v4, v3 +; IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IEEE-NEXT: s_setpc_b64 s[30:31] +; +; DAZ-LABEL: v_fdiv_recip_sqrt_f32: +; DAZ: ; %bb.0: +; DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; DAZ-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; DAZ-NEXT: v_rcp_f32_e32 v2, v1 +; DAZ-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; DAZ-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; DAZ-NEXT: v_fma_f32 v2, v4, v2, v2 +; DAZ-NEXT: v_mul_f32_e32 v4, v3, v2 +; DAZ-NEXT: v_fma_f32 v5, -v1, v4, v3 +; DAZ-NEXT: v_fma_f32 v4, v5, v2, v4 +; DAZ-NEXT: v_fma_f32 v1, -v1, v4, v3 +; DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; DAZ-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; DAZ-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; DAZ-NEXT: s_setpc_b64 s[30:31] + %sqrt = call float @llvm.sqrt.f32(float %x) + %fdiv = fdiv float 1.0, %sqrt + ret float %fdiv +} + +define float @v_fdiv_recip_sqrt_f32_afn(float %x) { +; CHECK-LABEL: v_fdiv_recip_sqrt_f32_afn: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_rsq_f32_e32 v0, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %sqrt = call afn float @llvm.sqrt.f32(float %x) + %fdiv = fdiv afn float 1.0, %sqrt + ret float %fdiv +} + +define float @v_fdiv_recip_sqrt_f32_arcp(float %x) { +; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp: +; CODEGEN-IEEE-SDAG: ; %bb.0: +; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp: +; CODEGEN-IEEE-GISEL: ; %bb.0: +; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], 
v0, v0, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; CODEGEN-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp: +; IR-IEEE-SDAG: ; %bb.0: +; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; IR-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp: +; IR-IEEE-GISEL: ; %bb.0: +; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-IEEE-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp: +; CODEGEN-DAZ-SDAG: ; %bb.0: +; CODEGEN-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; CODEGEN-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp: +; CODEGEN-DAZ-GISEL: ; %bb.0: +; 
CODEGEN-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; CODEGEN-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; CODEGEN-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; CODEGEN-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CODEGEN-DAZ-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; CODEGEN-DAZ-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; CODEGEN-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp: +; IR-DAZ-SDAG: ; %bb.0: +; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp: +; IR-DAZ-GISEL: ; %bb.0: +; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; IR-DAZ-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-DAZ-GISEL-NEXT: 
v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call arcp float @llvm.sqrt.f32(float %x) + %fdiv = fdiv arcp float 1.0, %sqrt + ret float %fdiv +} + +define float @v_fdiv_recip_sqrt_f32_arcp_afn(float %x) { +; CHECK-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_rsq_f32_e32 v0, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %sqrt = call arcp afn float @llvm.sqrt.f32(float %x) + %fdiv = fdiv arcp afn float 1.0, %sqrt + ret float %fdiv +} + +define float @v_fdiv_recip_sqrt_f32_arcp_fdiv_only(float %x) { +; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only: +; CODEGEN-IEEE-SDAG: ; %bb.0: +; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only: +; CODEGEN-IEEE-GISEL: ; %bb.0: +; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; CODEGEN-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only: +; IR-IEEE-SDAG: ; %bb.0: +; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; IR-IEEE-SDAG-NEXT: 
s_setpc_b64 s[30:31] +; +; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only: +; IR-IEEE-GISEL: ; %bb.0: +; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-IEEE-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only: +; CODEGEN-DAZ-SDAG: ; %bb.0: +; CODEGEN-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; CODEGEN-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only: +; CODEGEN-DAZ-GISEL: ; %bb.0: +; CODEGEN-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-DAZ-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; CODEGEN-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; CODEGEN-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 +; CODEGEN-DAZ-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; CODEGEN-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; CODEGEN-DAZ-GISEL-NEXT: v_div_fmas_f32 v1, 
v1, v2, v4 +; CODEGEN-DAZ-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; CODEGEN-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only: +; IR-DAZ-SDAG: ; %bb.0: +; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v0, v0 +; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only: +; IR-DAZ-GISEL: ; %bb.0: +; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1 +; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; IR-DAZ-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-DAZ-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call float @llvm.sqrt.f32(float %x) + %fdiv = fdiv arcp float 1.0, %sqrt + ret float %fdiv +} + +define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) { +; CHECK-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_rsq_f32_e32 v0, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %sqrt = call float @llvm.sqrt.f32(float %x) + %fdiv = fdiv afn float 1.0, %sqrt + ret float %fdiv +} + +define float @v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only(float %x) { +; CHECK-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_rsq_f32_e32 v0, v0 +; 
CHECK-NEXT: s_setpc_b64 s[30:31] + %sqrt = call float @llvm.sqrt.f32(float %x) + %fdiv = fdiv arcp afn float 1.0, %sqrt + ret float %fdiv +} + +define float @v_fdiv_f32_ulp25(float %x, float %y) { +; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_f32_ulp25: +; CODEGEN-IEEE-SDAG: ; %bb.0: +; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-SDAG-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_rcp_f32_e32 v3, v2 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v3, v4, v3, v3 +; CODEGEN-IEEE-SDAG-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v5, v4, v3 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v6, -v2, v5, v4 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v5, v6, v3, v5 +; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v2, -v2, v5, v4 +; CODEGEN-IEEE-SDAG-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; CODEGEN-IEEE-SDAG-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_f32_ulp25: +; CODEGEN-IEEE-GISEL: ; %bb.0: +; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v3, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v3, v5, v3, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v5, v4, v3 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v6, -v2, v5, v4 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, v6, v3, v5 +; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v2, -v2, v5, v4 +; CODEGEN-IEEE-GISEL-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; CODEGEN-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-SDAG-LABEL: v_fdiv_f32_ulp25: +; IR-IEEE-SDAG: ; %bb.0: +; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v2, s[4:5], 
v1, v1, v0 +; IR-IEEE-SDAG-NEXT: v_rcp_f32_e32 v3, v2 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, v4, v3, v3 +; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v5, v4, v3 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v6, -v2, v5, v4 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v5, v6, v3, v5 +; IR-IEEE-SDAG-NEXT: v_fma_f32 v2, -v2, v5, v4 +; IR-IEEE-SDAG-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; IR-IEEE-SDAG-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; IR-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-GISEL-LABEL: v_fdiv_f32_ulp25: +; IR-IEEE-GISEL: ; %bb.0: +; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v3, v2 +; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v3, v5, v3, v3 +; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v5, v4, v3 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v6, -v2, v5, v4 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, v6, v3, v5 +; IR-IEEE-GISEL-NEXT: v_fma_f32 v2, -v2, v5, v4 +; IR-IEEE-GISEL-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; IR-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_f32_ulp25: +; CODEGEN-DAZ-SDAG: ; %bb.0: +; CODEGEN-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-SDAG-NEXT: s_mov_b32 s4, 0x6f800000 +; CODEGEN-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x2f800000 +; CODEGEN-DAZ-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; CODEGEN-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, v1, v2 +; CODEGEN-DAZ-SDAG-NEXT: v_rcp_f32_e32 v1, v1 +; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; CODEGEN-DAZ-SDAG-NEXT: v_mul_f32_e32 v0, v2, v0 +; CODEGEN-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_f32_ulp25: +; CODEGEN-DAZ-GISEL: ; %bb.0: +; 
CODEGEN-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x6f800000 +; CODEGEN-DAZ-GISEL-NEXT: v_mov_b32_e32 v3, 0x2f800000 +; CODEGEN-DAZ-GISEL-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2 +; CODEGEN-DAZ-GISEL-NEXT: v_rcp_f32_e32 v1, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; CODEGEN-DAZ-GISEL-NEXT: v_mul_f32_e32 v0, v2, v0 +; CODEGEN-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-LABEL: v_fdiv_f32_ulp25: +; IR-DAZ: ; %bb.0: +; IR-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; IR-DAZ-NEXT: v_rcp_f32_e32 v3, v2 +; IR-DAZ-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; IR-DAZ-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; IR-DAZ-NEXT: v_fma_f32 v3, v5, v3, v3 +; IR-DAZ-NEXT: v_mul_f32_e32 v5, v4, v3 +; IR-DAZ-NEXT: v_fma_f32 v6, -v2, v5, v4 +; IR-DAZ-NEXT: v_fma_f32 v5, v6, v3, v5 +; IR-DAZ-NEXT: v_fma_f32 v2, -v2, v5, v4 +; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; IR-DAZ-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; IR-DAZ-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; IR-DAZ-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv float %x, %y, !fpmath !0 + ret float %fdiv +} + +define float @v_fdiv_f32_afn_ulp25(float %x, float %y) { +; CHECK-LABEL: v_fdiv_f32_afn_ulp25: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_rcp_f32_e32 v1, v1 +; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv afn float %x, %y, !fpmath !0 + ret float %fdiv +} + +define float @v_recip_f32_ulp25(float %x) { +; CODEGEN-IEEE-SDAG-LABEL: v_recip_f32_ulp25: +; CODEGEN-IEEE-SDAG: ; %bb.0: +; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-SDAG-NEXT: s_mov_b32 s4, 0x6f800000 +; CODEGEN-IEEE-SDAG-NEXT: 
v_mov_b32_e32 v1, 0x2f800000 +; CODEGEN-IEEE-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-IEEE-GISEL-LABEL: v_recip_f32_ulp25: +; CODEGEN-IEEE-GISEL: ; %bb.0: +; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x6f800000 +; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x2f800000 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-LABEL: v_recip_f32_ulp25: +; IR-IEEE: ; %bb.0: +; IR-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; IR-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-LABEL: v_recip_f32_ulp25: +; CODEGEN-DAZ: ; %bb.0: +; CODEGEN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-LABEL: v_recip_f32_ulp25: +; IR-DAZ: ; %bb.0: +; IR-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-NEXT: 
v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-DAZ-NEXT: v_rcp_f32_e32 v2, v1 +; IR-DAZ-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; IR-DAZ-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-DAZ-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-DAZ-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-DAZ-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-DAZ-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-DAZ-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; IR-DAZ-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-DAZ-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-DAZ-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv float 1.0, %x, !fpmath !0 + ret float %fdiv +} + +define float @v_recip_f32_afn_ulp25(float %x) { +; CHECK-LABEL: v_recip_f32_afn_ulp25: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_rcp_f32_e32 v0, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv afn float 1.0, %x, !fpmath !0 + ret float %fdiv +} + +define float @v_recip_sqrt_f32_ulp25(float %x) { +; CODEGEN-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25: +; CODEGEN-IEEE-SDAG: ; %bb.0: +; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-IEEE-SDAG-NEXT: s_mov_b32 s4, 0x6f800000 +; CODEGEN-IEEE-SDAG-NEXT: v_mov_b32_e32 v1, 0x2f800000 +; CODEGEN-IEEE-SDAG-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4 +; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc +; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v0, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25: +; CODEGEN-IEEE-GISEL: ; %bb.0: +; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x6f800000 +; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 
0x2f800000 +; CODEGEN-IEEE-GISEL-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v0, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; IR-IEEE-LABEL: v_recip_sqrt_f32_ulp25: +; IR-IEEE: ; %bb.0: +; IR-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-IEEE-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; IR-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; CODEGEN-DAZ-LABEL: v_recip_sqrt_f32_ulp25: +; CODEGEN-DAZ: ; %bb.0: +; CODEGEN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; CODEGEN-DAZ-NEXT: s_setpc_b64 s[30:31] +; +; IR-DAZ-LABEL: v_recip_sqrt_f32_ulp25: +; IR-DAZ: ; %bb.0: +; IR-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; IR-DAZ-NEXT: v_sqrt_f32_e32 v0, v0 +; IR-DAZ-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; IR-DAZ-NEXT: v_rcp_f32_e32 v2, v1 +; IR-DAZ-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; IR-DAZ-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; IR-DAZ-NEXT: v_fma_f32 v2, v4, v2, v2 +; IR-DAZ-NEXT: v_mul_f32_e32 v4, v3, v2 +; IR-DAZ-NEXT: v_fma_f32 v5, -v1, v4, v3 +; IR-DAZ-NEXT: v_fma_f32 v4, v5, v2, v4 +; IR-DAZ-NEXT: v_fma_f32 v1, -v1, v4, v3 +; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; 
IR-DAZ-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; IR-DAZ-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; IR-DAZ-NEXT: s_setpc_b64 s[30:31] + %sqrt = call float @llvm.sqrt.f32(float %x), !fpmath !0 + %fdiv = fdiv float 1.0, %sqrt, !fpmath !0 + ret float %fdiv +} + +define float @v_recip_sqrt_f32_afn_ulp25(float %x) { +; CHECK-LABEL: v_recip_sqrt_f32_afn_ulp25: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_rsq_f32_e32 v0, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %sqrt = call afn float @llvm.sqrt.f32(float %x), !fpmath !0 + %fdiv = fdiv afn float 1.0, %sqrt, !fpmath !0 + ret float %fdiv +} + +declare float @llvm.sqrt.f32(float) + +!0 = !{float 2.500000e+00}