Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3053,22 +3053,14 @@ Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); - uint16_t Flags = MI.getFlags(); - LLT ResTy = MRI.getType(Res); - LLT S32 = LLT::scalar(32); - LLT S64 = LLT::scalar(64); const MachineFunction &MF = B.getMF(); - bool Unsafe = - MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); + bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || + MI.getFlag(MachineInstr::FmAfn); - if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) - return false; - - if (!Unsafe && ResTy == S32 && - MF.getInfo()->getMode().allFP32Denormals()) + if (!AllowInaccurateRcp) return false; if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { @@ -3095,17 +3087,13 @@ } // x / y -> x * (1.0 / y) - if (Unsafe) { - auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) - .addUse(RHS) - .setMIFlags(Flags); - B.buildFMul(Res, LHS, RCP, Flags); - - MI.eraseFromParent(); - return true; - } + auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) + .addUse(RHS) + .setMIFlags(Flags); + B.buildFMul(Res, LHS, RCP, Flags); - return false; + MI.eraseFromParent(); + return true; } bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -77,46 +77,15 @@ } define half @v_fdiv_f16_afn(half %a, half %b) { -; GFX6-IEEE-LABEL: v_fdiv_f16_afn: -; GFX6-IEEE: ; %bb.0: -; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-FLUSH-LABEL: v_fdiv_f16_afn: -; GFX6-FLUSH: ; %bb.0: -; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_f16_afn: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f16_afn: ; GFX8: ; %bb.0: @@ -247,39 +216,90 @@ ; GFX8-LABEL: v_rcp_f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_e32 v0, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v0, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half 1.0, %x ret half %fdiv } define half @v_rcp_f16_arcp(half %x) { -; GFX6-LABEL: v_rcp_f16_arcp: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_setpc_b64 s[30:31] +; GFX6-IEEE-LABEL: v_rcp_f16_arcp: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-FLUSH-LABEL: v_rcp_f16_arcp: +; GFX6-FLUSH: ; %bb.0: +; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_f16_arcp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_e32 v0, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_f16_arcp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v0, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp half 1.0, %x ret half %fdiv @@ -369,7 +389,35 @@ } define half @v_fdiv_f16_afn_ulp25(half %a, half %b) { -; GFX6-IEEE-LABEL: v_fdiv_f16_afn_ulp25: +; GFX6-LABEL: v_fdiv_f16_afn_ulp25: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fdiv_f16_afn_ulp25: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_rcp_f16_e32 v1, v1 +; GFX8-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fdiv_f16_afn_ulp25: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rcp_f16_e32 v1, v1 +; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv afn half %a, %b, !fpmath !0 + ret half %fdiv +} + +define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) { +; GFX6-IEEE-LABEL: v_fdiv_f16_arcp_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -388,7 +436,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-FLUSH-LABEL: v_fdiv_f16_afn_ulp25: +; GFX6-FLUSH-LABEL: v_fdiv_f16_arcp_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -410,46 +458,26 @@ ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fdiv_f16_afn_ulp25: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_e32 v1, v1 -; GFX8-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_fdiv_f16_afn_ulp25: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v1, v1 -; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] - %fdiv = fdiv afn half %a, %b, !fpmath !0 - ret half %fdiv -} - -define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) { -; GFX6-LABEL: v_fdiv_f16_arcp_ulp25: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_rcp_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_setpc_b64 s[30:31] -; ; GFX8-LABEL: v_fdiv_f16_arcp_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_e32 v1, v1 -; GFX8-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_f16_arcp_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v1, v1 -; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp half %a, %b, !fpmath !0 ret half %fdiv @@ -575,76 +603,20 @@ } define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) { -; GFX6-IEEE-LABEL: v_fdiv_v2f16_afn: -; GFX6-IEEE: ; %bb.0: -; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 -; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1 -; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-FLUSH-LABEL: v_fdiv_v2f16_afn: -; GFX6-FLUSH: ; %bb.0: -; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 -; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 -; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 -; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 -; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_v2f16_afn: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v3, v3 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f16_afn: ; GFX8: ; %bb.0: @@ -866,8 +838,18 @@ ; GFX8-LABEL: v_rcp_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_rcp_f16_e32 v0, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_rcp_f32_e32 v3, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -876,35 +858,113 @@ ; GFX9-LABEL: v_rcp_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v1, v0 -; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> , %x ret <2 x half> %fdiv } define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) { -; GFX6-LABEL: v_rcp_v2f16_arcp: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0 -; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: s_setpc_b64 s[30:31] +; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 +; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp: +; GFX6-FLUSH: ; %bb.0: +; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 +; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f16_arcp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_rcp_f16_e32 v0, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_rcp_f32_e32 v3, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -913,10 +973,21 @@ ; GFX9-LABEL: v_rcp_v2f16_arcp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v1, v0 -; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x half> , %x ret <2 x half> %fdiv @@ -1054,7 +1125,49 @@ } define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) { -; GFX6-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v3, v3 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_rcp_f16_e32 v2, v1 +; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_rcp_f16_e32 v2, v1 +; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 +; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0 + ret <2 x half> %fdiv +} + +define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) { +; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -1087,7 +1200,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 @@ -1125,69 +1238,48 @@ ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_e32 v2, v1 -; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v2, v1 -; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] - %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0 - ret <2 x half> %fdiv -} - -define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) { -; GFX6-LABEL: v_fdiv_v2f16_arcp_ulp25: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_rcp_f32_e32 v2, v2 -; GFX6-NEXT: v_rcp_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: s_setpc_b64 s[30:31] -; ; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f16_e32 v2, v1 -; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX8-NEXT: v_rcp_f32_e32 v5, v5 +; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5 +; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f16_e32 v2, v1 -; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2 -; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6 +; GFX9-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5 +; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -250,7 +250,19 @@ ; GFX6-FLUSH-LABEL: v_rcp_f32: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_rcp_f32: @@ -272,7 +284,19 @@ ; GFX8-FLUSH-LABEL: v_rcp_f32: ; GFX8-FLUSH: ; %bb.0: ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 +; GFX8-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX8-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; GFX8-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX8-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GFX8-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3 +; GFX8-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GFX8-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX8-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX8-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX8-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX8-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-IEEE-LABEL: v_rcp_f32: @@ -294,30 +318,126 @@ ; GFX9-FLUSH-LABEL: v_rcp_f32: ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX9-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GFX9-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3 +; GFX9-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GFX9-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX9-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX9-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv float 1.0, %x ret float %fdiv } define float @v_rcp_f32_arcp(float %x) { -; GFX6-LABEL: v_rcp_f32_arcp: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_setpc_b64 s[30:31] +; GFX6-IEEE-LABEL: v_rcp_f32_arcp: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_rcp_f32_arcp: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX6-FLUSH-LABEL: v_rcp_f32_arcp: +; GFX6-FLUSH: ; %bb.0: +; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_rcp_f32_arcp: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX8-IEEE-LABEL: v_rcp_f32_arcp: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX8-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v3, v1 +; GFX8-IEEE-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GFX8-IEEE-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v4, v2, v3 +; GFX8-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GFX8-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX8-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX8-IEEE-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX8-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-FLUSH-LABEL: v_rcp_f32_arcp: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX8-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; GFX8-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX8-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GFX8-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3 +; GFX8-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GFX8-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX8-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX8-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX8-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX8-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-IEEE-LABEL: v_rcp_f32_arcp: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX9-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v1 +; GFX9-IEEE-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v4, v2, v3 +; GFX9-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GFX9-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX9-IEEE-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX9-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_rcp_f32_arcp: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX9-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GFX9-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3 +; GFX9-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GFX9-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX9-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX9-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp float 1.0, %x ret float %fdiv } @@ -434,8 +554,17 @@ ; GFX6-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v1, v1 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25: @@ -454,8 +583,17 @@ ; GFX8-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: ; GFX8-IEEE: ; %bb.0: ; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v1, v1 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX8-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX8-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX8-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX8-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX8-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX8-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX8-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25: @@ -474,8 +612,17 @@ ; GFX9-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: ; GFX9-IEEE: ; %bb.0: ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX9-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX9-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX9-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX9-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX9-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25: @@ -880,8 +1027,32 @@ ; GFX6-FLUSH-LABEL: v_rcp_v2f32: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-IEEE-LABEL: v_rcp_v2f32: @@ -915,8 +1086,32 @@ ; GFX8-FLUSH-LABEL: v_rcp_v2f32: ; GFX8-FLUSH: ; %bb.0: ; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 -; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX8-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX8-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX8-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX8-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX8-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX8-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX8-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX8-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX8-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX8-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX8-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX8-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0 +; GFX8-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX8-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v4 +; GFX8-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5 +; GFX8-FLUSH-NEXT: v_fma_f32 v3, -v3, v5, v4 +; GFX8-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX8-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v2, v5 +; GFX8-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-IEEE-LABEL: v_rcp_v2f32: @@ -950,34 +1145,213 @@ ; GFX9-FLUSH-LABEL: v_rcp_v2f32: ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX9-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX9-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX9-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX9-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX9-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX9-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX9-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX9-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0 +; GFX9-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX9-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v4 +; GFX9-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5 +; GFX9-FLUSH-NEXT: v_fma_f32 v3, -v3, v5, v4 +; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v2, v5 +; GFX9-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> , %x ret <2 x float> %fdiv } define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) { -; GFX6-LABEL: v_rcp_v2f32_arcp: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_f32_e32 v1, v1 -; GFX6-NEXT: s_setpc_b64 s[30:31] +; GFX6-IEEE-LABEL: v_rcp_v2f32_arcp: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_rcp_v2f32_arcp: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, v0 -; GFX8-NEXT: v_rcp_f32_e32 v1, v1 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX6-FLUSH-LABEL: v_rcp_v2f32_arcp: +; GFX6-FLUSH: ; %bb.0: +; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_rcp_v2f32_arcp: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX8-IEEE-LABEL: v_rcp_v2f32_arcp: +; GFX8-IEEE: ; %bb.0: +; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX8-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX8-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX8-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], 1.0, v1, 1.0 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v2 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v7, v3 +; GFX8-IEEE-NEXT: v_fma_f32 v8, -v2, v6, 1.0 +; GFX8-IEEE-NEXT: v_fma_f32 v9, -v3, v7, 1.0 +; GFX8-IEEE-NEXT: v_fma_f32 v6, v8, v6, v6 +; GFX8-IEEE-NEXT: v_fma_f32 v7, v9, v7, v7 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX8-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v4 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v5, v7 +; GFX8-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v5 +; GFX8-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 +; GFX8-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v4 +; GFX8-IEEE-NEXT: v_fma_f32 v9, v11, v7, v9 +; GFX8-IEEE-NEXT: v_div_fmas_f32 v2, v2, v6, v8 +; GFX8-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v5 +; GFX8-IEEE-NEXT: s_mov_b64 vcc, s[4:5] +; GFX8-IEEE-NEXT: v_div_fmas_f32 v3, v3, v7, v9 +; GFX8-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX8-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 +; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-FLUSH-LABEL: v_rcp_v2f32_arcp: +; GFX8-FLUSH: ; %bb.0: +; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX8-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX8-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX8-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX8-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX8-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX8-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX8-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX8-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX8-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX8-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX8-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX8-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX8-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0 +; GFX8-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5 +; GFX8-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX8-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v4 +; GFX8-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5 +; GFX8-FLUSH-NEXT: v_fma_f32 v3, -v3, v5, v4 +; GFX8-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX8-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v2, v5 +; GFX8-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-IEEE-LABEL: v_rcp_v2f32_arcp: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX9-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX9-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX9-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], 1.0, v1, 1.0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v7, v3 +; GFX9-IEEE-NEXT: v_fma_f32 v8, -v2, v6, 1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v9, -v3, v7, 1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v6, v8, v6, v6 +; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v7, v7 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 +; GFX9-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v4 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v5, v7 +; GFX9-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v5 +; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 +; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v4 +; GFX9-IEEE-NEXT: v_fma_f32 v9, v11, v7, v9 +; GFX9-IEEE-NEXT: v_div_fmas_f32 v2, v2, v6, v8 +; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v5 +; GFX9-IEEE-NEXT: s_mov_b64 vcc, s[4:5] +; GFX9-IEEE-NEXT: v_div_fmas_f32 v3, v3, v7, v9 +; GFX9-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX9-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_rcp_v2f32_arcp: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX9-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX9-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX9-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX9-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX9-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX9-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX9-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 +; GFX9-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0 +; GFX9-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 +; GFX9-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v4 +; GFX9-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5 +; GFX9-FLUSH-NEXT: v_fma_f32 v3, -v3, v5, v4 +; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v2, v5 +; GFX9-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> , %x ret <2 x float> %fdiv } @@ -1124,10 +1498,28 @@ ; GFX6-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v6, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v4, v6, v6 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v7, -v5, v6, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v5, v6, v2 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: @@ -1152,10 +1544,29 @@ ; GFX8-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: ; GFX8-IEEE: ; %bb.0: ; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GFX8-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX8-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX8-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX8-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 +; GFX8-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v8, v4 +; GFX8-IEEE-NEXT: v_rcp_f32_e32 v9, v5 +; GFX8-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0 +; GFX8-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 +; GFX8-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8 +; GFX8-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8 +; GFX8-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 +; GFX8-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9 +; GFX8-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7 +; GFX8-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10 +; GFX8-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 +; GFX8-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11 +; GFX8-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10 +; GFX8-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7 +; GFX8-IEEE-NEXT: s_mov_b64 vcc, s[4:5] +; GFX8-IEEE-NEXT: v_div_fmas_f32 v5, v5, v9, v11 +; GFX8-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX8-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: @@ -1180,10 +1591,29 @@ ; GFX9-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: ; GFX9-IEEE: ; %bb.0: ; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX9-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX9-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 +; GFX9-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v8, v4 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v9, v5 +; GFX9-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 +; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8 +; GFX9-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8 +; GFX9-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9 +; GFX9-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7 +; GFX9-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10 +; GFX9-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 +; GFX9-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11 +; GFX9-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10 +; GFX9-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7 +; GFX9-IEEE-NEXT: s_mov_b64 vcc, s[4:5] +; GFX9-IEEE-NEXT: v_div_fmas_f32 v5, v5, v9, v11 +; GFX9-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX9-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: Index: llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -410,21 +410,12 @@ ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; CI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1] -; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1] -; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1] -; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] -; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1] +; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] +; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3] ; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v3, s5 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -438,21 +429,12 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 ; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1] -; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1] -; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1] -; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3] -; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1] +; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] +; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3] ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir @@ -308,21 +308,60 @@ ; SI-LABEL: name: test_fdiv_s32_denorms_off_arcp ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; SI: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) - ; SI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]] - ; SI: $vgpr0 = COPY [[FMUL]](s32) + ; SI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; SI: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0 + ; SI: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1 + ; SI: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; SI: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]] + ; SI: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode + ; SI: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]] + ; SI: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]] + ; SI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]] + ; SI: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; SI: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; SI: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; SI: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode + ; SI: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; SI: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) + ; SI: $vgpr0 = COPY [[INT6]](s32) ; VI-LABEL: name: test_fdiv_s32_denorms_off_arcp ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; VI: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) - ; VI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]] - ; VI: $vgpr0 = COPY [[FMUL]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; VI: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0 + ; VI: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1 + ; VI: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; VI: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]] + ; VI: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode + ; VI: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]] + ; VI: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]] + ; VI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]] + ; VI: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; VI: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; VI: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; VI: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode + ; VI: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; VI: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) + ; VI: $vgpr0 = COPY [[INT6]](s32) ; GFX9-LABEL: name: test_fdiv_s32_denorms_off_arcp ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) - ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]] - ; GFX9: $vgpr0 = COPY [[FMUL]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX9: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0 + ; GFX9: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1 + ; GFX9: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; GFX9: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]] + ; GFX9: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode + ; GFX9: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]] + ; GFX9: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]] + ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]] + ; GFX9: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; GFX9: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; GFX9: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; GFX9: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode + ; GFX9: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; GFX9: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) + ; GFX9: $vgpr0 = COPY [[INT6]](s32) ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off_arcp ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 @@ -332,9 +371,22 @@ ; GFX10-LABEL: name: test_fdiv_s32_denorms_off_arcp ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) - ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]] - ; GFX10: $vgpr0 = COPY [[FMUL]](s32) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX10: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0 + ; GFX10: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1 + ; GFX10: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; GFX10: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]] + ; GFX10: S_DENORM_MODE 15, implicit-def $mode, implicit $mode + ; GFX10: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]] + ; GFX10: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]] + ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]] + ; GFX10: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; GFX10: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; GFX10: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; GFX10: S_DENORM_MODE 12, implicit-def $mode, implicit $mode + ; GFX10: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; GFX10: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) + ; GFX10: $vgpr0 = COPY [[INT6]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = arcp G_FDIV %0, %1 @@ -1898,16 +1950,28 @@ ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; SI: $vgpr0 = COPY [[ANYEXT]](s32) ; VI-LABEL: name: test_fdiv_s16_constant_one_rcp + ; VI: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; VI: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) - ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; VI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) + ; VI: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; VI: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; VI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; VI: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; VI: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) + ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; VI: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: test_fdiv_s16_constant_one_rcp + ; GFX9: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; GFX9: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) + ; GFX9: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; GFX9: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX9: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; GFX9: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_one_rcp ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -1916,10 +1980,16 @@ ; GFX9-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX9-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX10-LABEL: name: test_fdiv_s16_constant_one_rcp + ; GFX10: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00 ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) - ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; GFX10: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) + ; GFX10: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; GFX10: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX10: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; GFX10: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; GFX10: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s16) = G_FCONSTANT half 1.0 %1:_(s32) = COPY $vgpr0 @@ -1958,18 +2028,28 @@ ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) ; SI: $vgpr0 = COPY [[ANYEXT]](s32) ; VI-LABEL: name: test_fdiv_s16_constant_negative_one_rcp + ; VI: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00 ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; VI: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]] - ; VI: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) - ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; VI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) + ; VI: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; VI: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; VI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; VI: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; VI: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) + ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; VI: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: test_fdiv_s16_constant_negative_one_rcp + ; GFX9: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]] - ; GFX9: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; GFX9: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) + ; GFX9: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; GFX9: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX9: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; GFX9: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_negative_one_rcp ; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -1979,11 +2059,16 @@ ; GFX9-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX9-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX10-LABEL: name: test_fdiv_s16_constant_negative_one_rcp + ; GFX10: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00 ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]] - ; GFX10: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) - ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; GFX10: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16) + ; GFX10: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; GFX10: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX10: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; GFX10: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16) + ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; GFX10: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s16) = G_FCONSTANT half -1.0 %1:_(s32) = COPY $vgpr0