Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -269,6 +269,14 @@ Value *RsqOp, const Instruction *FDiv, float ReqdAccuracy) const; + std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder, + Value *Src) const; + + Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src, + bool IsNegative) const; + Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS, + FastMathFlags FMF) const; + public: bool visitFDiv(BinaryOperator &I); @@ -763,9 +771,30 @@ return true; } +std::pair<Value *, Value *> +AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder, + Value *Src) const { + Type *Ty = Src->getType(); + Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp, + {Ty, Builder.getInt32Ty()}, Src); + Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0}); + + // Bypass the bug workaround for the exponent result since it doesn't matter. + // TODO: Does the bug workaround even really need to consider the exponent + // result? It's unspecified by the spec. + + Value *FrexpExp = + ST->hasFractBug() + ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp, + {Builder.getInt32Ty(), Ty}, Src) + : Builder.CreateExtractValue(Frexp, {1}); + return {FrexpMant, FrexpExp}; +} + /// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals. -static Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative, - bool HasFractBug) { +Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder, + Value *Src, + bool IsNegative) const { // Same as for 1.0, but expand the sign out of the constant. // -1.0 / x -> rcp (fneg x) if (IsNegative) @@ -782,25 +811,44 @@ // 0x1p+126 < den <= 0x1p+127. 
Type *Ty = Src->getType(); - Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp, - {Ty, Builder.getInt32Ty()}, Src); - Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0}); - - // Bypass the bug workaround for the exponent result since it doesn't matter. - // TODO: Does the bug workaround even really need to consider the exponent - // result? It's unspecified by the spec. - - Value *FrexpExp = - HasFractBug ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp, - {Builder.getInt32Ty(), Ty}, Src) - : Builder.CreateExtractValue(Frexp, {1}); + auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src); Value *ScaleFactor = Builder.CreateNeg(FrexpExp); Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant); return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()}, {Rcp, ScaleFactor}); } +/// Emit a 2ulp expansion for fdiv by using frexp for input scaling. +Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, + Value *RHS, + FastMathFlags FMF) const { + // If we have to work around the fract/frexp bug, we're worse off than + // using the fdiv.fast expansion. The full safe expansion is faster if we have + // fast FMA. + if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() && + (!FMF.noNaNs() || !FMF.noInfs())) + return nullptr; + + // We're scaling the LHS to avoid a denormal input, and scale the denominator + // to avoid large values underflowing the result. + Type *Ty = LHS->getType(); + + auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS); + + Value *Rcp = + Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS); + + auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS); + Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp); + + // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the + // result. 
+ Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS); + return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()}, + {Mul, ExpDiff}); +} + /// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals. static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative) { @@ -922,7 +970,7 @@ // TODO: If the input isn't denormal, and we know the input exponent isn't // big enough to introduce a denormal we can avoid the scaling. - return emitRcpIEEE1ULP(Builder, Src, IsNegative, ST->hasFractBug()); + return emitRcpIEEE1ULP(Builder, Src, IsNegative); } } @@ -936,7 +984,7 @@ return Builder.CreateFMul(Num, Recip); } - Value *Recip = emitRcpIEEE1ULP(Builder, Den, false, ST->hasFractBug()); + Value *Recip = emitRcpIEEE1ULP(Builder, Den, false); return Builder.CreateFMul(Num, Recip); } @@ -958,8 +1006,7 @@ // Only have fdiv.fast for f32. Type *Ty = Den->getType(); - if (!Ty->isFloatTy()) - return nullptr; + assert(Ty->isFloatTy()); bool NumIsOne = false; if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) { @@ -968,6 +1015,9 @@ } // fdiv does not support denormals. But 1.0/x is always fine to use it. + // + // TODO: This works for any value with a specific known exponent range, don't + // just limit to constant 1. if (!HasFP32DenormalFlush && !NumIsOne) return nullptr; @@ -989,7 +1039,15 @@ if (Rcp) return Rcp; - return optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy); + // In the basic case fdiv_fast has the same instruction count as the frexp div + // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can + // potentially be fused into a user. Also, materialization of the constants + // can be reused for multiple instances. 
+ Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy); + if (FDivFast) + return FDivFast; + + return emitFrexpDiv(Builder, Num, Den, DivFMF); } // Optimizations is performed based on fpmath, fast math flags as well as Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -228,21 +228,23 @@ } define float @v_fdiv_f32_ulp25(float %a, float %b) { -; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_f32_ulp25: -; GFX6-IEEE-FASTFMA: ; %bb.0: -; GFX6-IEEE-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-IEEE-FASTFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-IEEE-LABEL: v_fdiv_f32_ulp25: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX6-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX6-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; 
GFX6-IEEE-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-FLUSH-LABEL: v_fdiv_f32_ulp25: ; GCN-FLUSH: ; %bb.0: @@ -257,52 +259,17 @@ ; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v2, v0 ; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_f32_ulp25: -; GFX6-IEEE-SLOWFMA: ; %bb.0: -; GFX6-IEEE-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-IEEE-SLOWFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-IEEE-LABEL: v_fdiv_f32_ulp25: -; GFX89-IEEE: ; %bb.0: -; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX89-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-IEEE-LABEL: v_fdiv_f32_ulp25: ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-IEEE-NEXT: 
v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 -; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX10-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX10-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-IEEE-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLUSH-LABEL: v_fdiv_f32_ulp25: @@ -319,22 +286,17 @@ ; GFX11-IEEE-LABEL: v_fdiv_f32_ulp25: ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX11-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX11-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 -; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 
-; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 -; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 -; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FLUSH-LABEL: v_fdiv_f32_ulp25: @@ -1240,32 +1202,35 @@ } define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) { -; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_v2f32_ulp25: -; GFX6-IEEE-FASTFMA: ; %bb.0: -; GFX6-IEEE-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v5, v4 -; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v7, -v4, v5, 1.0 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, v7, v5, v5 -; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v7, v6, v5 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v8, -v4, v7, v6 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v7, v8, v5, v7 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v4, -v4, v7, v6 -; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v6, v5 -; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v4, -v5, v6, 1.0 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v4, v4, v6, v6 -; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v6, v2, v4 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v7, -v5, v6, v2 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v2, -v5, v6, v2 -; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v6 
-; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v1, v2, v3, v1 -; GFX6-IEEE-FASTFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-IEEE-LABEL: v_fdiv_v2f32_ulp25: +; GFX6-IEEE: ; %bb.0: +; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-IEEE-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v2 +; GFX6-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s4 +; GFX6-IEEE-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc +; GFX6-IEEE-NEXT: v_frexp_mant_f32_e32 v5, v0 +; GFX6-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_cndmask_b32_e32 v5, v0, v5, vcc +; GFX6-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v4 +; GFX6-IEEE-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; GFX6-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v3 +; GFX6-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s4 +; GFX6-IEEE-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX6-IEEE-NEXT: v_ldexp_f32_e32 v0, v4, v0 +; GFX6-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v1 +; GFX6-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v3, v3 +; GFX6-IEEE-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc +; GFX6-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX6-IEEE-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GFX6-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1 +; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-FLUSH-LABEL: v_fdiv_v2f32_ulp25: ; GCN-FLUSH: ; %bb.0: @@ -1286,88 +1251,25 @@ ; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, v4, v1 ; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_v2f32_ulp25: -; GFX6-IEEE-SLOWFMA: ; %bb.0: -; GFX6-IEEE-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; 
GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1 -; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v8, v4 -; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v9, v5 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v10, -v4, v8, 1.0 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v8, v10, v8, v8 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v11, -v5, v9, 1.0 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v9, v11, v9, v9 -; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v10, v6, v8 -; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v11, v7, v9 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v12, -v4, v10, v6 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v13, -v5, v11, v7 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v10, v12, v8, v10 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v11, v13, v9, v11 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, -v4, v10, v6 -; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, -v5, v11, v7 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v4, v4, v8, v10 -; GFX6-IEEE-SLOWFMA-NEXT: s_mov_b64 vcc, s[4:5] -; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v5, v5, v9, v11 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v1, v5, v3, v1 -; GFX6-IEEE-SLOWFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX89-IEEE-LABEL: v_fdiv_v2f32_ulp25: -; GFX89-IEEE: ; %bb.0: -; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v8, v4 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v9, v5 -; GFX89-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9 -; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_fma_f32 v10, v12, 
v8, v10 -; GFX89-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11 -; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10 -; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v9, v11 -; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 -; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-IEEE-LABEL: v_fdiv_v2f32_ulp25: ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 -; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 -; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 +; GFX10-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v2 +; GFX10-IEEE-NEXT: v_frexp_mant_f32_e32 v5, v3 +; GFX10-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v6, v0 +; GFX10-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v4 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 
v5, v5 +; GFX10-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v7, v1 +; GFX10-IEEE-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX10-IEEE-NEXT: v_frexp_mant_f32_e32 v1, v1 +; GFX10-IEEE-NEXT: v_sub_nc_u32_e32 v2, v6, v2 +; GFX10-IEEE-NEXT: v_sub_nc_u32_e32 v3, v7, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX10-IEEE-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX10-IEEE-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLUSH-LABEL: v_fdiv_v2f32_ulp25: @@ -1390,37 +1292,24 @@ ; GFX11-IEEE-LABEL: v_fdiv_v2f32_ulp25: ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-IEEE-NEXT: v_div_scale_f32 v4, null, v2, v2, v0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v5, null, v3, v3, v1 -; GFX11-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX11-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v2 +; GFX11-IEEE-NEXT: v_frexp_mant_f32_e32 v5, v3 +; GFX11-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v2, v2 +; GFX11-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v6, v0 +; GFX11-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v3, v3 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v5 +; GFX11-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v7, v1 +; GFX11-IEEE-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX11-IEEE-NEXT: v_frexp_mant_f32_e32 v1, v1 +; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v2, v6, v2 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_sub_nc_u32_e32 v3, v7, v3 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_dual_fmac_f32 v6, v8, v6 :: v_dual_fmac_f32 v7, v9, v7 -; GFX11-IEEE-NEXT: 
v_div_scale_f32 v8, s0, v1, v3, v1 -; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 -; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX11-IEEE-NEXT: v_dual_mul_f32 v0, v0, v4 :: v_dual_mul_f32 v1, v1, v5 +; GFX11-IEEE-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 +; GFX11-IEEE-NEXT: v_ldexp_f32 v1, v1, v3 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FLUSH-LABEL: v_fdiv_v2f32_ulp25: Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -28,11 +28,38 @@ ; IEEE-GOODFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 ; IEEE-GOODFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], 
[[B]], !fpmath !2 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = fmul float [[TMP6]], [[TMP4]] +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[MD_1ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP8]], i32 [[TMP9]]) ; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP10]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP14]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP14]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = fmul float [[TMP15]], [[TMP13]] +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], [[TMP12]] +; IEEE-GOODFREXP-NEXT: [[MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP18]]) ; IEEE-GOODFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[MD_3ULP:%.*]] = fdiv 
float [[A]], [[B]], !fpmath !3 +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = fmul float [[TMP24]], [[TMP22]] +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = sub i32 [[TMP25]], [[TMP21]] +; IEEE-GOODFREXP-NEXT: [[MD_3ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP27]]) ; IEEE-GOODFREXP-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[FAST_MD_25ULP:%.*]] = fdiv fast float [[A]], [[B]], !fpmath !0 ; IEEE-GOODFREXP-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 @@ -40,21 +67,21 @@ ; IEEE-GOODFREXP-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] ; IEEE-GOODFREXP-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) -; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) -; IEEE-GOODFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp 
float [[A]], [[TMP6]] +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP28]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = sub i32 0, [[TMP30]] +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]]) +; IEEE-GOODFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP33]] ; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP7]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) -; IEEE-GOODFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP12]] +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = extractvalue { float, i32 } [[TMP34]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP34]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP36]] +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP35]]) +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP38]], i32 [[TMP37]]) +; IEEE-GOODFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP39]] ; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], 
align 4 ; IEEE-GOODFREXP-NEXT: ret void ; @@ -64,11 +91,38 @@ ; IEEE-BADFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 ; IEEE-BADFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = fmul float [[TMP6]], [[TMP4]] +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP3]] +; IEEE-BADFREXP-NEXT: [[MD_1ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP8]], i32 [[TMP9]]) ; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP14]], 0 +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call i32 
@llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = fmul float [[TMP15]], [[TMP13]] +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], [[TMP12]] +; IEEE-BADFREXP-NEXT: [[MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP18]]) ; IEEE-BADFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[MD_3ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !3 +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = fmul float [[TMP24]], [[TMP22]] +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = sub i32 [[TMP25]], [[TMP21]] +; IEEE-BADFREXP-NEXT: [[MD_3ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP27]]) ; IEEE-BADFREXP-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[FAST_MD_25ULP:%.*]] = fdiv fast float [[A]], [[B]], !fpmath !0 ; IEEE-BADFREXP-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 @@ -76,21 +130,21 @@ ; IEEE-BADFREXP-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] ; IEEE-BADFREXP-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = 
extractvalue { float, i32 } [[TMP1]], 0 -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP6]] +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = sub i32 0, [[TMP30]] +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP33]] ; IEEE-BADFREXP-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) -; IEEE-BADFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP12]] +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = extractvalue { float, i32 } [[TMP34]], 0 +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call i32 
@llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP36]] +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP35]]) +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP38]], i32 [[TMP37]]) +; IEEE-BADFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP39]] ; IEEE-BADFREXP-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: ret void ; @@ -100,7 +154,16 @@ ; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 ; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 +; DAZ-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; DAZ-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; DAZ-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; DAZ-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; DAZ-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; DAZ-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; DAZ-NEXT: [[TMP8:%.*]] = fmul float [[TMP6]], [[TMP4]] +; DAZ-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP3]] +; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP8]], i32 [[TMP9]]) ; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[MD_25ULP:%.*]] = call float @llvm.amdgcn.fdiv.fast(float [[A]], float [[B]]) ; DAZ-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 @@ -112,11 +175,11 @@ ; DAZ-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] ; DAZ-NEXT: store volatile float 
[[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[TMP1:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) -; DAZ-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP1]] +; DAZ-NEXT: [[TMP10:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) +; DAZ-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP10]] ; DAZ-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[TMP2:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) -; DAZ-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP2]] +; DAZ-NEXT: [[TMP11:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) +; DAZ-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP11]] ; DAZ-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; @@ -144,33 +207,184 @@ } define amdgpu_kernel void @fdiv_fpmath_f32_flags(ptr addrspace(1) %out, float %a, float %b) { -; IEEE-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_flags -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]]) #[[ATTR1:[0-9]+]] { -; IEEE-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = fdiv nnan ninf float [[A]], [[B]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_25ULP_NINF_NNAN:%.*]] = fdiv nnan ninf float [[A]], [[B]], !fpmath !0 -; IEEE-NEXT: store volatile float [[MD_25ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP_NINF:%.*]] = fdiv ninf float [[A]], [[B]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_25ULP_NINF:%.*]] = fdiv ninf float [[A]], [[B]], !fpmath !0 -; IEEE-NEXT: store volatile float [[MD_25ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[MD_1ULP_NNAN:%.*]] = fdiv nnan float [[A]], [[B]], !fpmath !2 -; IEEE-NEXT: store volatile float [[MD_1ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: 
[[MD_25ULP_NNAN:%.*]] = fdiv nnan float [[A]], [[B]], !fpmath !0 -; IEEE-NEXT: store volatile float [[MD_25ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_flags +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = fmul nnan ninf float [[TMP6]], [[TMP4]] +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP8]], i32 [[TMP9]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP10]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP14]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP14]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = fmul nnan ninf float [[TMP15]], 
[[TMP13]] +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], [[TMP12]] +; IEEE-GOODFREXP-NEXT: [[MD_25ULP_NINF_NNAN:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP18]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_25ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = fmul ninf float [[TMP24]], [[TMP22]] +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = sub i32 [[TMP25]], [[TMP21]] +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NINF:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP27]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP28]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = fmul ninf float [[TMP33]], [[TMP31]] +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = sub i32 
[[TMP34]], [[TMP30]] +; IEEE-GOODFREXP-NEXT: [[MD_25ULP_NINF:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP36]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_25ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = extractvalue { float, i32 } [[TMP37]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = extractvalue { float, i32 } [[TMP37]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP38]]) +; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP41]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = extractvalue { float, i32 } [[TMP41]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP44:%.*]] = fmul nnan float [[TMP42]], [[TMP40]] +; IEEE-GOODFREXP-NEXT: [[TMP45:%.*]] = sub i32 [[TMP43]], [[TMP39]] +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_NNAN:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP44]], i32 [[TMP45]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[TMP46:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP47:%.*]] = extractvalue { float, i32 } [[TMP46]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP48:%.*]] = extractvalue { float, i32 } [[TMP46]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP49:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP47]]) +; IEEE-GOODFREXP-NEXT: [[TMP50:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP51:%.*]] = extractvalue { float, i32 } [[TMP50]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP52:%.*]] = extractvalue { float, i32 } [[TMP50]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP53:%.*]] = fmul nnan float [[TMP51]], [[TMP49]] +; IEEE-GOODFREXP-NEXT: [[TMP54:%.*]] = sub i32 [[TMP52]], [[TMP48]] +; IEEE-GOODFREXP-NEXT: [[MD_25ULP_NNAN:%.*]] = call 
nnan float @llvm.ldexp.f32.i32(float [[TMP53]], i32 [[TMP54]]) +; IEEE-GOODFREXP-NEXT: store volatile float [[MD_25ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_flags +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = fmul nnan ninf float [[TMP6]], [[TMP4]] +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP3]] +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP8]], i32 [[TMP9]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP14]], 0 +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; 
IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = fmul nnan ninf float [[TMP15]], [[TMP13]] +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], [[TMP12]] +; IEEE-BADFREXP-NEXT: [[MD_25ULP_NINF_NNAN:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP18]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_25ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = fmul ninf float [[TMP24]], [[TMP22]] +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = sub i32 [[TMP25]], [[TMP21]] +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NINF:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP27]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; IEEE-BADFREXP-NEXT: 
[[TMP35:%.*]] = fmul ninf float [[TMP33]], [[TMP31]] +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = sub i32 [[TMP34]], [[TMP30]] +; IEEE-BADFREXP-NEXT: [[MD_25ULP_NINF:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP36]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_25ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = extractvalue { float, i32 } [[TMP37]], 0 +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP38]]) +; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP41]], 0 +; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP44:%.*]] = fmul nnan float [[TMP42]], [[TMP40]] +; IEEE-BADFREXP-NEXT: [[TMP45:%.*]] = sub i32 [[TMP43]], [[TMP39]] +; IEEE-BADFREXP-NEXT: [[MD_1ULP_NNAN:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP44]], i32 [[TMP45]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[TMP46:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP47:%.*]] = extractvalue { float, i32 } [[TMP46]], 0 +; IEEE-BADFREXP-NEXT: [[TMP48:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP49:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP47]]) +; IEEE-BADFREXP-NEXT: [[TMP50:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP51:%.*]] = extractvalue { float, i32 } [[TMP50]], 0 +; IEEE-BADFREXP-NEXT: [[TMP52:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP53:%.*]] = fmul nnan float [[TMP51]], 
[[TMP49]] +; IEEE-BADFREXP-NEXT: [[TMP54:%.*]] = sub i32 [[TMP52]], [[TMP48]] +; IEEE-BADFREXP-NEXT: [[MD_25ULP_NNAN:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP53]], i32 [[TMP54]]) +; IEEE-BADFREXP-NEXT: store volatile float [[MD_25ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_flags ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]]) #[[ATTR1]] { -; DAZ-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = fdiv nnan ninf float [[A]], [[B]], !fpmath !2 +; DAZ-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; DAZ-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; DAZ-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; DAZ-NEXT: [[TMP4:%.*]] = call nnan ninf float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; DAZ-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; DAZ-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; DAZ-NEXT: [[TMP8:%.*]] = fmul nnan ninf float [[TMP6]], [[TMP4]] +; DAZ-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP3]] +; DAZ-NEXT: [[MD_1ULP_NINF_NNAN:%.*]] = call nnan ninf float @llvm.ldexp.f32.i32(float [[TMP8]], i32 [[TMP9]]) ; DAZ-NEXT: store volatile float [[MD_1ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[MD_25ULP_NINF_NNAN:%.*]] = call nnan ninf float @llvm.amdgcn.fdiv.fast(float [[A]], float [[B]]) ; DAZ-NEXT: store volatile float [[MD_25ULP_NINF_NNAN]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[MD_1ULP_NINF:%.*]] = fdiv ninf float [[A]], [[B]], !fpmath !2 +; DAZ-NEXT: [[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; DAZ-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 +; DAZ-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP10]], 1 +; DAZ-NEXT: [[TMP13:%.*]] = call ninf float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; 
DAZ-NEXT: [[TMP14:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; DAZ-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP14]], 0 +; DAZ-NEXT: [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP14]], 1 +; DAZ-NEXT: [[TMP17:%.*]] = fmul ninf float [[TMP15]], [[TMP13]] +; DAZ-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], [[TMP12]] +; DAZ-NEXT: [[MD_1ULP_NINF:%.*]] = call ninf float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP18]]) ; DAZ-NEXT: store volatile float [[MD_1ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[MD_25ULP_NINF:%.*]] = call ninf float @llvm.amdgcn.fdiv.fast(float [[A]], float [[B]]) ; DAZ-NEXT: store volatile float [[MD_25ULP_NINF]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[MD_1ULP_NNAN:%.*]] = fdiv nnan float [[A]], [[B]], !fpmath !2 +; DAZ-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; DAZ-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; DAZ-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1 +; DAZ-NEXT: [[TMP22:%.*]] = call nnan float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; DAZ-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; DAZ-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 +; DAZ-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1 +; DAZ-NEXT: [[TMP26:%.*]] = fmul nnan float [[TMP24]], [[TMP22]] +; DAZ-NEXT: [[TMP27:%.*]] = sub i32 [[TMP25]], [[TMP21]] +; DAZ-NEXT: [[MD_1ULP_NNAN:%.*]] = call nnan float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP27]]) ; DAZ-NEXT: store volatile float [[MD_1ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[MD_25ULP_NNAN:%.*]] = call nnan float @llvm.amdgcn.fdiv.fast(float [[A]], float [[B]]) ; DAZ-NEXT: store volatile float [[MD_25ULP_NNAN]], ptr addrspace(1) [[OUT]], align 4 @@ -961,31 +1175,129 @@ } define amdgpu_kernel void @fdiv_fpmath_f32_vector(ptr addrspace(1) %out, <2 x float> %a, <2 x float> %b) { -; IEEE-LABEL: define amdgpu_kernel void 
@fdiv_fpmath_f32_vector -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[NO_MD:%.*]] = fdiv <2 x float> [[A]], [[B]] -; IEEE-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 8 -; IEEE-NEXT: [[MD_HALF_ULP:%.*]] = fdiv <2 x float> [[A]], [[B]], !fpmath !1 -; IEEE-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 8 -; IEEE-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[A]], i64 0 -; IEEE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[A]], i64 1 -; IEEE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[B]], i64 0 -; IEEE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[B]], i64 1 -; IEEE-NEXT: [[TMP5:%.*]] = fdiv float [[TMP1]], [[TMP3]], !fpmath !2 -; IEEE-NEXT: [[TMP6:%.*]] = fdiv float [[TMP2]], [[TMP4]], !fpmath !2 -; IEEE-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i64 0 -; IEEE-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP6]], i64 1 -; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 8 -; IEEE-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[A]], i64 0 -; IEEE-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[A]], i64 1 -; IEEE-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[B]], i64 0 -; IEEE-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[B]], i64 1 -; IEEE-NEXT: [[TMP12:%.*]] = fdiv float [[TMP8]], [[TMP10]], !fpmath !0 -; IEEE-NEXT: [[TMP13:%.*]] = fdiv float [[TMP9]], [[TMP11]], !fpmath !0 -; IEEE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 -; IEEE-NEXT: [[MD_25ULP:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i64 1 -; IEEE-NEXT: store volatile <2 x float> [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 8 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_vector +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[A:%.*]], <2 x 
float> [[B:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[NO_MD:%.*]] = fdiv <2 x float> [[A]], [[B]] +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-GOODFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv <2 x float> [[A]], [[B]], !fpmath !1 +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[A]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[A]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[B]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[B]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP9]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP9]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = fmul float [[TMP10]], [[TMP8]] +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = sub i32 [[TMP11]], [[TMP7]] +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP15]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP15]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP16]]) +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) +; 
IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = fmul float [[TMP20]], [[TMP18]] +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = sub i32 [[TMP21]], [[TMP17]] +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP23]]) +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP14]], i64 0 +; IEEE-GOODFREXP-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP24]], i64 1 +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[A]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[A]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[B]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[B]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP28]]) +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP30]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP30]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP31]]) +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP26]]) +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = extractvalue { float, i32 } [[TMP34]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP34]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = fmul float [[TMP35]], [[TMP33]] +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = sub i32 [[TMP36]], [[TMP32]] +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP37]], i32 [[TMP38]]) +; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP29]]) +; IEEE-GOODFREXP-NEXT: 
[[TMP41:%.*]] = extractvalue { float, i32 } [[TMP40]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP40]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP41]]) +; IEEE-GOODFREXP-NEXT: [[TMP44:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP27]]) +; IEEE-GOODFREXP-NEXT: [[TMP45:%.*]] = extractvalue { float, i32 } [[TMP44]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP46:%.*]] = extractvalue { float, i32 } [[TMP44]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP47:%.*]] = fmul float [[TMP45]], [[TMP43]] +; IEEE-GOODFREXP-NEXT: [[TMP48:%.*]] = sub i32 [[TMP46]], [[TMP42]] +; IEEE-GOODFREXP-NEXT: [[TMP49:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP47]], i32 [[TMP48]]) +; IEEE-GOODFREXP-NEXT: [[TMP50:%.*]] = insertelement <2 x float> poison, float [[TMP39]], i64 0 +; IEEE-GOODFREXP-NEXT: [[MD_25ULP:%.*]] = insertelement <2 x float> [[TMP50]], float [[TMP49]], i64 1 +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_vector +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[NO_MD:%.*]] = fdiv <2 x float> [[A]], [[B]] +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-BADFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv <2 x float> [[A]], [[B]], !fpmath !1 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[A]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[A]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[B]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[B]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 
[[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP9]], 0 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP1]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = fmul float [[TMP10]], [[TMP8]] +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = sub i32 [[TMP11]], [[TMP7]] +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP15]], 0 +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP16]]) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = fmul float [[TMP20]], [[TMP18]] +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = sub i32 [[TMP21]], [[TMP17]] +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP23]]) +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP14]], i64 0 +; IEEE-BADFREXP-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP24]], i64 1 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = extractelement <2 x 
float> [[A]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[A]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[B]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[B]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP28]]) +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP30]], 0 +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP28]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP26]]) +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = extractvalue { float, i32 } [[TMP34]], 0 +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP26]]) +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = fmul float [[TMP35]], [[TMP33]] +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = sub i32 [[TMP36]], [[TMP32]] +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP37]], i32 [[TMP38]]) +; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP29]]) +; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP40]], 0 +; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP29]]) +; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP41]]) +; IEEE-BADFREXP-NEXT: [[TMP44:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP27]]) +; IEEE-BADFREXP-NEXT: [[TMP45:%.*]] = extractvalue { float, i32 } [[TMP44]], 0 +; IEEE-BADFREXP-NEXT: [[TMP46:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP27]]) +; IEEE-BADFREXP-NEXT: [[TMP47:%.*]] = fmul float [[TMP45]], [[TMP43]] +; IEEE-BADFREXP-NEXT: [[TMP48:%.*]] = sub i32 [[TMP46]], [[TMP42]] +; IEEE-BADFREXP-NEXT: [[TMP49:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP47]], i32 
[[TMP48]]) +; IEEE-BADFREXP-NEXT: [[TMP50:%.*]] = insertelement <2 x float> poison, float [[TMP39]], i64 0 +; IEEE-BADFREXP-NEXT: [[MD_25ULP:%.*]] = insertelement <2 x float> [[TMP50]], float [[TMP49]], i64 1 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 8 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @fdiv_fpmath_f32_vector ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]) #[[ATTR1]] { @@ -997,19 +1309,37 @@ ; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[A]], i64 1 ; DAZ-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[B]], i64 0 ; DAZ-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[B]], i64 1 -; DAZ-NEXT: [[TMP5:%.*]] = fdiv float [[TMP1]], [[TMP3]], !fpmath !2 -; DAZ-NEXT: [[TMP6:%.*]] = fdiv float [[TMP2]], [[TMP4]], !fpmath !2 -; DAZ-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i64 0 -; DAZ-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP6]], i64 1 +; DAZ-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; DAZ-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; DAZ-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; DAZ-NEXT: [[TMP8:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP6]]) +; DAZ-NEXT: [[TMP9:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]]) +; DAZ-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP9]], 0 +; DAZ-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP9]], 1 +; DAZ-NEXT: [[TMP12:%.*]] = fmul float [[TMP10]], [[TMP8]] +; DAZ-NEXT: [[TMP13:%.*]] = sub i32 [[TMP11]], [[TMP7]] +; DAZ-NEXT: [[TMP14:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP12]], i32 [[TMP13]]) +; DAZ-NEXT: [[TMP15:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; DAZ-NEXT: [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP15]], 0 +; DAZ-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP15]], 1 +; 
DAZ-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP16]]) +; DAZ-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP2]]) +; DAZ-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; DAZ-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1 +; DAZ-NEXT: [[TMP22:%.*]] = fmul float [[TMP20]], [[TMP18]] +; DAZ-NEXT: [[TMP23:%.*]] = sub i32 [[TMP21]], [[TMP17]] +; DAZ-NEXT: [[TMP24:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP22]], i32 [[TMP23]]) +; DAZ-NEXT: [[TMP25:%.*]] = insertelement <2 x float> poison, float [[TMP14]], i64 0 +; DAZ-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP25]], float [[TMP24]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 8 -; DAZ-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[A]], i64 0 -; DAZ-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[A]], i64 1 -; DAZ-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[B]], i64 0 -; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[B]], i64 1 -; DAZ-NEXT: [[TMP12:%.*]] = call float @llvm.amdgcn.fdiv.fast(float [[TMP8]], float [[TMP10]]) -; DAZ-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.fdiv.fast(float [[TMP9]], float [[TMP11]]) -; DAZ-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 -; DAZ-NEXT: [[MD_25ULP:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i64 1 +; DAZ-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[A]], i64 0 +; DAZ-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[A]], i64 1 +; DAZ-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[B]], i64 0 +; DAZ-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[B]], i64 1 +; DAZ-NEXT: [[TMP30:%.*]] = call float @llvm.amdgcn.fdiv.fast(float [[TMP26]], float [[TMP28]]) +; DAZ-NEXT: [[TMP31:%.*]] = call float @llvm.amdgcn.fdiv.fast(float [[TMP27]], float [[TMP29]]) +; DAZ-NEXT: [[TMP32:%.*]] = insertelement <2 x float> poison, float [[TMP30]], i64 0 +; DAZ-NEXT: 
[[MD_25ULP:%.*]] = insertelement <2 x float> [[TMP32]], float [[TMP31]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 8 ; DAZ-NEXT: ret void ; @@ -1777,67 +2107,147 @@ } define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x float> %x) { -; IEEE-LABEL: define amdgpu_kernel void @rsq_f32_vector_fpmath -; IEEE-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { -; IEEE-NEXT: [[SQRT_X_NO_MD:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]) -; IEEE-NEXT: [[NO_MD:%.*]] = fdiv contract <2 x float> , [[SQRT_X_NO_MD]] -; IEEE-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_MD_1ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 -; IEEE-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 0 -; IEEE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 1 -; IEEE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-NEXT: [[TMP5:%.*]] = fcmp contract olt float [[TMP3]], 0x3810000000000000 -; IEEE-NEXT: [[TMP6:%.*]] = select contract i1 [[TMP5]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-NEXT: [[TMP7:%.*]] = fmul contract float [[TMP3]], [[TMP6]] -; IEEE-NEXT: [[TMP8:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP7]]) -; IEEE-NEXT: [[TMP9:%.*]] = select contract i1 [[TMP5]], float 4.096000e+03, float 1.000000e+00 -; IEEE-NEXT: [[TMP10:%.*]] = fmul contract float [[TMP8]], [[TMP9]] -; IEEE-NEXT: [[TMP11:%.*]] = fcmp contract olt float [[TMP4]], 0x3810000000000000 -; IEEE-NEXT: [[TMP12:%.*]] = select contract i1 [[TMP11]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-NEXT: [[TMP13:%.*]] = fmul contract float [[TMP4]], [[TMP12]] -; IEEE-NEXT: [[TMP14:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP13]]) -; IEEE-NEXT: [[TMP15:%.*]] 
= select contract i1 [[TMP11]], float 4.096000e+03, float 1.000000e+00 -; IEEE-NEXT: [[TMP16:%.*]] = fmul contract float [[TMP14]], [[TMP15]] -; IEEE-NEXT: [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 -; IEEE-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP16]], i64 1 -; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 -; IEEE-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 -; IEEE-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 -; IEEE-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-NEXT: [[TMP22:%.*]] = fcmp contract olt float [[TMP20]], 0x3810000000000000 -; IEEE-NEXT: [[TMP23:%.*]] = select contract i1 [[TMP22]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-NEXT: [[TMP24:%.*]] = fmul contract float [[TMP20]], [[TMP23]] -; IEEE-NEXT: [[TMP25:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP24]]) -; IEEE-NEXT: [[TMP26:%.*]] = select contract i1 [[TMP22]], float 4.096000e+03, float 1.000000e+00 -; IEEE-NEXT: [[TMP27:%.*]] = fmul contract float [[TMP25]], [[TMP26]] -; IEEE-NEXT: [[TMP28:%.*]] = fdiv contract float undef, [[TMP19]], !fpmath !2 -; IEEE-NEXT: [[TMP29:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0 -; IEEE-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP29]], float [[TMP28]], i64 1 -; IEEE-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !3 -; IEEE-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 -; IEEE-NEXT: [[TMP31:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 -; IEEE-NEXT: 
[[TMP32:%.*]] = extractelement <2 x float> [[X]], i64 0 -; IEEE-NEXT: [[TMP33:%.*]] = extractelement <2 x float> [[X]], i64 1 -; IEEE-NEXT: [[TMP34:%.*]] = fcmp contract olt float [[TMP32]], 0x3810000000000000 -; IEEE-NEXT: [[TMP35:%.*]] = select contract i1 [[TMP34]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-NEXT: [[TMP36:%.*]] = fmul contract float [[TMP32]], [[TMP35]] -; IEEE-NEXT: [[TMP37:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP36]]) -; IEEE-NEXT: [[TMP38:%.*]] = select contract i1 [[TMP34]], float 4.096000e+03, float 1.000000e+00 -; IEEE-NEXT: [[TMP39:%.*]] = fmul contract float [[TMP37]], [[TMP38]] -; IEEE-NEXT: [[TMP40:%.*]] = fcmp contract olt float [[TMP33]], 0x3810000000000000 -; IEEE-NEXT: [[TMP41:%.*]] = select contract i1 [[TMP40]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-NEXT: [[TMP42:%.*]] = fmul contract float [[TMP33]], [[TMP41]] -; IEEE-NEXT: [[TMP43:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP42]]) -; IEEE-NEXT: [[TMP44:%.*]] = select contract i1 [[TMP40]], float 4.096000e+03, float 1.000000e+00 -; IEEE-NEXT: [[TMP45:%.*]] = fmul contract float [[TMP43]], [[TMP44]] -; IEEE-NEXT: [[TMP46:%.*]] = insertelement <2 x float> poison, float [[TMP39]], i64 0 -; IEEE-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP46]], float [[TMP45]], i64 1 -; IEEE-NEXT: store volatile <2 x float> [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-NEXT: ret void +; IEEE-GOODFREXP-LABEL: define amdgpu_kernel void @rsq_f32_vector_fpmath +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[SQRT_X_NO_MD:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]) +; IEEE-GOODFREXP-NEXT: [[NO_MD:%.*]] = fdiv contract <2 x float> , [[SQRT_X_NO_MD]] +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[SQRT_MD_1ULP:%.*]] = call contract <2 x float> 
@llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = fcmp contract olt float [[TMP3]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = select contract i1 [[TMP5]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = fmul contract float [[TMP3]], [[TMP6]] +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP7]]) +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = select contract i1 [[TMP5]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = fmul contract float [[TMP8]], [[TMP9]] +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fcmp contract olt float [[TMP4]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = select contract i1 [[TMP11]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = fmul contract float [[TMP4]], [[TMP12]] +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = select contract i1 [[TMP11]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = fmul contract float [[TMP14]], [[TMP15]] +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 +; IEEE-GOODFREXP-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP16]], i64 1 +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = 
extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = fcmp contract olt float [[TMP20]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = select contract i1 [[TMP22]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = fmul contract float [[TMP20]], [[TMP23]] +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP24]]) +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = select contract i1 [[TMP22]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = fmul contract float [[TMP25]], [[TMP26]] +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP19]]) +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP28]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = fmul contract float [[TMP33]], [[TMP31]] +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = sub i32 [[TMP34]], [[TMP30]] +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP36]]) +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0 +; IEEE-GOODFREXP-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 +; IEEE-GOODFREXP-NEXT: 
store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-GOODFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !3 +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = fcmp contract olt float [[TMP41]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP44:%.*]] = select contract i1 [[TMP43]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP45:%.*]] = fmul contract float [[TMP41]], [[TMP44]] +; IEEE-GOODFREXP-NEXT: [[TMP46:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP45]]) +; IEEE-GOODFREXP-NEXT: [[TMP47:%.*]] = select contract i1 [[TMP43]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP48:%.*]] = fmul contract float [[TMP46]], [[TMP47]] +; IEEE-GOODFREXP-NEXT: [[TMP49:%.*]] = fcmp contract olt float [[TMP42]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP50:%.*]] = select contract i1 [[TMP49]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP51:%.*]] = fmul contract float [[TMP42]], [[TMP50]] +; IEEE-GOODFREXP-NEXT: [[TMP52:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP51]]) +; IEEE-GOODFREXP-NEXT: [[TMP53:%.*]] = select contract i1 [[TMP49]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP54:%.*]] = fmul contract float [[TMP52]], [[TMP53]] +; IEEE-GOODFREXP-NEXT: [[TMP55:%.*]] = insertelement <2 x float> poison, float [[TMP48]], i64 0 +; IEEE-GOODFREXP-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP55]], float [[TMP54]], i64 1 +; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 
4 +; IEEE-GOODFREXP-NEXT: ret void +; +; IEEE-BADFREXP-LABEL: define amdgpu_kernel void @rsq_f32_vector_fpmath +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[SQRT_X_NO_MD:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]) +; IEEE-BADFREXP-NEXT: [[NO_MD:%.*]] = fdiv contract <2 x float> , [[SQRT_X_NO_MD]] +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[SQRT_MD_1ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = fcmp contract olt float [[TMP3]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = select contract i1 [[TMP5]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = fmul contract float [[TMP3]], [[TMP6]] +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP7]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = select contract i1 [[TMP5]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = fmul contract float [[TMP8]], [[TMP9]] +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fcmp contract olt float [[TMP4]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = select contract i1 [[TMP11]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = fmul contract float [[TMP4]], [[TMP12]] +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = select contract i1 [[TMP11]], float 4.096000e+03, float 1.000000e+00 +; 
IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = fmul contract float [[TMP14]], [[TMP15]] +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i64 0 +; IEEE-BADFREXP-NEXT: [[MD_1ULP:%.*]] = insertelement <2 x float> [[TMP17]], float [[TMP16]], i64 1 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2 +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = fcmp contract olt float [[TMP20]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = select contract i1 [[TMP22]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = fmul contract float [[TMP20]], [[TMP23]] +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP24]]) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = select contract i1 [[TMP22]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = fmul contract float [[TMP25]], [[TMP26]] +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP19]]) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP19]]) +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = 
call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = fmul contract float [[TMP33]], [[TMP31]] +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = sub i32 [[TMP34]], [[TMP30]] +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP36]]) +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0 +; IEEE-BADFREXP-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !3 +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = extractelement <2 x float> [[X]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[X]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = fcmp contract olt float [[TMP41]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP44:%.*]] = select contract i1 [[TMP43]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP45:%.*]] = fmul contract float [[TMP41]], [[TMP44]] +; IEEE-BADFREXP-NEXT: [[TMP46:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP45]]) +; IEEE-BADFREXP-NEXT: [[TMP47:%.*]] = select contract i1 [[TMP43]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP48:%.*]] = fmul contract float [[TMP46]], [[TMP47]] +; IEEE-BADFREXP-NEXT: [[TMP49:%.*]] = fcmp contract olt float [[TMP42]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP50:%.*]] = select contract i1 [[TMP49]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP51:%.*]] = fmul contract float [[TMP42]], [[TMP50]] +; IEEE-BADFREXP-NEXT: [[TMP52:%.*]] = call contract float 
@llvm.amdgcn.rsq.f32(float [[TMP51]]) +; IEEE-BADFREXP-NEXT: [[TMP53:%.*]] = select contract i1 [[TMP49]], float 4.096000e+03, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP54:%.*]] = fmul contract float [[TMP52]], [[TMP53]] +; IEEE-BADFREXP-NEXT: [[TMP55:%.*]] = insertelement <2 x float> poison, float [[TMP48]], i64 0 +; IEEE-BADFREXP-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP55]], float [[TMP54]], i64 1 +; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 +; IEEE-BADFREXP-NEXT: ret void ; ; DAZ-LABEL: define amdgpu_kernel void @rsq_f32_vector_fpmath ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { @@ -1860,19 +2270,28 @@ ; DAZ-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[X]], i64 0 ; DAZ-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[X]], i64 1 ; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP10]]) -; DAZ-NEXT: [[TMP13:%.*]] = fdiv contract float undef, [[TMP9]], !fpmath !2 -; DAZ-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP12]], i64 0 -; DAZ-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP13]], i64 1 +; DAZ-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP9]]) +; DAZ-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 +; DAZ-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP13]], 1 +; DAZ-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; DAZ-NEXT: [[TMP17:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; DAZ-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP17]], 0 +; DAZ-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP17]], 1 +; DAZ-NEXT: [[TMP20:%.*]] = fmul contract float [[TMP18]], [[TMP16]] +; DAZ-NEXT: [[TMP21:%.*]] = sub i32 [[TMP19]], [[TMP15]] +; DAZ-NEXT: [[TMP22:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP20]], i32 [[TMP21]]) +; DAZ-NEXT: [[TMP23:%.*]] 
= insertelement <2 x float> poison, float [[TMP12]], i64 0 +; DAZ-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP23]], float [[TMP22]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !3 -; DAZ-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 -; DAZ-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 -; DAZ-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[X]], i64 0 -; DAZ-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[X]], i64 1 -; DAZ-NEXT: [[TMP19:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) -; DAZ-NEXT: [[TMP20:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP18]]) -; DAZ-NEXT: [[TMP21:%.*]] = insertelement <2 x float> poison, float [[TMP19]], i64 0 -; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP21]], float [[TMP20]], i64 1 +; DAZ-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0 +; DAZ-NEXT: [[TMP25:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1 +; DAZ-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[X]], i64 0 +; DAZ-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[X]], i64 1 +; DAZ-NEXT: [[TMP28:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP26]]) +; DAZ-NEXT: [[TMP29:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP27]]) +; DAZ-NEXT: [[TMP30:%.*]] = insertelement <2 x float> poison, float [[TMP28]], i64 0 +; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = insertelement <2 x float> [[TMP30]], float [[TMP29]], i64 1 ; DAZ-NEXT: store volatile <2 x float> [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; @@ -2603,36 +3022,103 @@ } define <4 x float> @rsq_f32_vector_mixed_constant_numerator(<4 x float> %arg) { -; IEEE-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator -; IEEE-SAME: (<4 x float> [[ARG:%.*]]) 
#[[ATTR1]] { -; IEEE-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 -; IEEE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; IEEE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; IEEE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; IEEE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; IEEE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; IEEE-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; IEEE-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 -; IEEE-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 -; IEEE-NEXT: [[TMP9:%.*]] = fcmp contract olt float [[TMP5]], 0x3810000000000000 -; IEEE-NEXT: [[TMP10:%.*]] = select contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-NEXT: [[TMP11:%.*]] = fmul contract float [[TMP5]], [[TMP10]] -; IEEE-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]]) -; IEEE-NEXT: [[TMP13:%.*]] = select contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00 -; IEEE-NEXT: [[TMP14:%.*]] = fmul contract float [[TMP12]], [[TMP13]] -; IEEE-NEXT: [[TMP15:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000 -; IEEE-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00 -; IEEE-NEXT: [[TMP17:%.*]] = fmul contract float [[TMP6]], [[TMP16]] -; IEEE-NEXT: [[TMP18:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) -; IEEE-NEXT: [[TMP19:%.*]] = select contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00 -; IEEE-NEXT: [[TMP20:%.*]] = fmul contract float [[TMP18]], [[TMP19]] -; IEEE-NEXT: [[TMP21:%.*]] = fdiv contract float 4.000000e+00, [[TMP3]], !fpmath !2 -; IEEE-NEXT: [[TMP22:%.*]] = fdiv contract float undef, [[TMP4]], !fpmath !2 -; IEEE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0 -; IEEE-NEXT: 
[[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP20]], i64 1 -; IEEE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP21]], i64 2 -; IEEE-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP22]], i64 3 -; IEEE-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; IEEE-GOODFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator +; IEEE-GOODFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = fcmp contract olt float [[TMP5]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = select contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fmul contract float [[TMP5]], [[TMP10]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = select contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = fmul contract float [[TMP12]], [[TMP13]] +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 
1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = fmul contract float [[TMP6]], [[TMP16]] +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = select contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = fmul contract float [[TMP18]], [[TMP19]] +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP21]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP21]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP22]]) +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP25]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = fmul contract float [[TMP26]], [[TMP24]] +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP23]] +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP29]]) +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP31]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]]) +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = extractvalue { float, i32 } [[TMP35]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = fmul contract float [[TMP36]], [[TMP34]] +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = sub 
i32 [[TMP37]], [[TMP33]] +; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP38]], i32 [[TMP39]]) +; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP20]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP30]], i64 2 +; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP43]], float [[TMP40]], i64 3 +; IEEE-GOODFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; +; IEEE-BADFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator +; IEEE-BADFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]), !fpmath !2 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = fcmp contract olt float [[TMP5]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = select contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fmul contract float [[TMP5]], [[TMP10]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = select contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00 +; 
IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = fmul contract float [[TMP12]], [[TMP13]] +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000 +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = fmul contract float [[TMP6]], [[TMP16]] +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP17]]) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = select contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = fmul contract float [[TMP18]], [[TMP19]] +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP21]], 0 +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP22]]) +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = extractvalue { float, i32 } [[TMP25]], 0 +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float 4.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = fmul contract float [[TMP26]], [[TMP24]] +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP23]] +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP28]], i32 [[TMP29]]) +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = extractvalue { float, i32 } [[TMP31]], 0 +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP32]]) +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = call { float, i32 } 
@llvm.frexp.f32.i32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP35]], 0 +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = fmul contract float [[TMP36]], [[TMP34]] +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = sub i32 [[TMP37]], [[TMP33]] +; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP38]], i32 [[TMP39]]) +; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = insertelement <4 x float> poison, float [[TMP14]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP20]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP30]], i64 2 +; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP43]], float [[TMP40]], i64 3 +; IEEE-BADFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator ; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { @@ -2648,12 +3134,30 @@ ; DAZ-NEXT: [[TMP9:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP5]]) ; DAZ-NEXT: [[TMP10:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP6]]) ; DAZ-NEXT: [[TMP11:%.*]] = fneg contract float [[TMP10]] -; DAZ-NEXT: [[TMP12:%.*]] = fdiv contract float 4.000000e+00, [[TMP3]], !fpmath !2 -; DAZ-NEXT: [[TMP13:%.*]] = fdiv contract float undef, [[TMP4]], !fpmath !2 -; DAZ-NEXT: [[TMP14:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i64 0 -; DAZ-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP11]], i64 1 -; DAZ-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP12]], i64 2 -; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP13]], i64 3 +; DAZ-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; DAZ-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; DAZ-NEXT: [[TMP14:%.*]] = 
extractvalue { float, i32 } [[TMP12]], 1 +; DAZ-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; DAZ-NEXT: [[TMP16:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) +; DAZ-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP16]], 0 +; DAZ-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP16]], 1 +; DAZ-NEXT: [[TMP19:%.*]] = fmul contract float [[TMP17]], [[TMP15]] +; DAZ-NEXT: [[TMP20:%.*]] = sub i32 [[TMP18]], [[TMP14]] +; DAZ-NEXT: [[TMP21:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP19]], i32 [[TMP20]]) +; DAZ-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; DAZ-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 +; DAZ-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1 +; DAZ-NEXT: [[TMP25:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP23]]) +; DAZ-NEXT: [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; DAZ-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0 +; DAZ-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP26]], 1 +; DAZ-NEXT: [[TMP29:%.*]] = fmul contract float [[TMP27]], [[TMP25]] +; DAZ-NEXT: [[TMP30:%.*]] = sub i32 [[TMP28]], [[TMP24]] +; DAZ-NEXT: [[TMP31:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP30]]) +; DAZ-NEXT: [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i64 0 +; DAZ-NEXT: [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP11]], i64 1 +; DAZ-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP21]], i64 2 +; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP31]], i64 3 ; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg), !fpmath !2 @@ -2662,27 +3166,125 @@ } define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt(<4 x float> %arg) { -; CHECK-LABEL: define <4 x float> 
@rsq_f32_vector_mixed_constant_numerator_afn_sqrt -; CHECK-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[DENOM:%.*]] = call contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]) -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 -; CHECK-NEXT: [[TMP9:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP5]]) -; CHECK-NEXT: [[TMP10:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP6]]) -; CHECK-NEXT: [[TMP11:%.*]] = fneg contract afn float [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fdiv contract float 4.000000e+00, [[TMP3]], !fpmath !2 -; CHECK-NEXT: [[TMP13:%.*]] = fdiv contract float undef, [[TMP4]], !fpmath !2 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP11]], i64 1 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP12]], i64 2 -; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP13]], i64 3 -; CHECK-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; IEEE-GOODFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt +; IEEE-GOODFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[DENOM:%.*]] = call contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]) +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = 
extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP5]]) +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP6]]) +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fneg contract afn float [[TMP10]] +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP16]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP16]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = fmul contract float [[TMP17]], [[TMP15]] +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = sub i32 [[TMP18]], [[TMP14]] +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP19]], i32 [[TMP20]]) +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = call 
contract float @llvm.amdgcn.rcp.f32(float [[TMP23]]) +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP26]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = fmul contract float [[TMP27]], [[TMP25]] +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = sub i32 [[TMP28]], [[TMP24]] +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP30]]) +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP11]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP21]], i64 2 +; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP31]], i64 3 +; IEEE-GOODFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; +; IEEE-BADFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt +; IEEE-BADFREXP-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[DENOM:%.*]] = call contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]) +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call contract afn 
float @llvm.amdgcn.rsq.f32(float [[TMP5]]) +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP6]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fneg contract afn float [[TMP10]] +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP16]], 0 +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float 4.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = fmul contract float [[TMP17]], [[TMP15]] +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = sub i32 [[TMP18]], [[TMP14]] +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP19]], i32 [[TMP20]]) +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP23]]) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0 +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = fmul contract float [[TMP27]], [[TMP25]] +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = sub i32 [[TMP28]], [[TMP24]] +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP29]], i32 
[[TMP30]]) +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP11]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP21]], i64 2 +; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP31]], i64 3 +; IEEE-BADFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; +; DAZ-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_afn_sqrt +; DAZ-SAME: (<4 x float> [[ARG:%.*]]) #[[ATTR1]] { +; DAZ-NEXT: [[DENOM:%.*]] = call contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> [[ARG]]) +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[DENOM]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3 +; DAZ-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0 +; DAZ-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1 +; DAZ-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2 +; DAZ-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3 +; DAZ-NEXT: [[TMP9:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP5]]) +; DAZ-NEXT: [[TMP10:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[TMP6]]) +; DAZ-NEXT: [[TMP11:%.*]] = fneg contract afn float [[TMP10]] +; DAZ-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; DAZ-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; DAZ-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 +; DAZ-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) +; DAZ-NEXT: [[TMP16:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) +; DAZ-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP16]], 0 +; DAZ-NEXT: 
[[TMP18:%.*]] = extractvalue { float, i32 } [[TMP16]], 1 +; DAZ-NEXT: [[TMP19:%.*]] = fmul contract float [[TMP17]], [[TMP15]] +; DAZ-NEXT: [[TMP20:%.*]] = sub i32 [[TMP18]], [[TMP14]] +; DAZ-NEXT: [[TMP21:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP19]], i32 [[TMP20]]) +; DAZ-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; DAZ-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 +; DAZ-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1 +; DAZ-NEXT: [[TMP25:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP23]]) +; DAZ-NEXT: [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; DAZ-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0 +; DAZ-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP26]], 1 +; DAZ-NEXT: [[TMP29:%.*]] = fmul contract float [[TMP27]], [[TMP25]] +; DAZ-NEXT: [[TMP30:%.*]] = sub i32 [[TMP28]], [[TMP24]] +; DAZ-NEXT: [[TMP31:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP29]], i32 [[TMP30]]) +; DAZ-NEXT: [[TMP32:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i64 0 +; DAZ-NEXT: [[TMP33:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP11]], i64 1 +; DAZ-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP21]], i64 2 +; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP31]], i64 3 +; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg) %partial.rsq = fdiv contract <4 x float> , %denom, !fpmath !2 @@ -2734,12 +3336,30 @@ ; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] ; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) ; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) -; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = fdiv contract float 4.000000e+00, [[TMP3]], !fpmath !2 -; 
IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = fdiv contract float undef, [[TMP4]], !fpmath !2 -; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP17]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i64 2 -; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = fmul contract float [[TMP23]], [[TMP21]] +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP20]] +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP25]], i32 [[TMP26]]) +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP28]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = extractvalue { float, i32 } [[TMP32]], 1 +; 
IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = fmul contract float [[TMP33]], [[TMP31]] +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = sub i32 [[TMP34]], [[TMP30]] +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP36]]) +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = insertelement <4 x float> [[TMP38]], float [[TMP17]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = insertelement <4 x float> [[TMP39]], float [[TMP27]], i64 2 +; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP37]], i64 3 ; IEEE-GOODFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; IEEE-BADFREXP-LABEL: define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt @@ -2762,12 +3382,30 @@ ; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]] ; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]]) ; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]]) -; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = fdiv contract float 4.000000e+00, [[TMP3]], !fpmath !2 -; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = fdiv contract float undef, [[TMP4]], !fpmath !2 -; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP17]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP18]], i64 2 -; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call 
contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float 4.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = fmul contract float [[TMP23]], [[TMP21]] +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP20]] +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP25]], i32 [[TMP26]]) +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = extractvalue { float, i32 } [[TMP32]], 0 +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = fmul contract float [[TMP33]], [[TMP31]] +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = sub i32 [[TMP34]], [[TMP30]] +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP35]], i32 [[TMP36]]) +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = insertelement <4 x float> poison, float [[TMP10]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = insertelement <4 x float> [[TMP38]], float [[TMP17]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = insertelement <4 x float> [[TMP39]], float [[TMP27]], i64 2 +; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP37]], i64 3 ; IEEE-BADFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; ; DAZ-LABEL: define <4 x float> 
@rsq_f32_vector_mixed_constant_numerator_correct_sqrt @@ -2780,12 +3418,30 @@ ; DAZ-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP1]]) ; DAZ-NEXT: [[TMP6:%.*]] = fneg contract float [[TMP2]] ; DAZ-NEXT: [[TMP7:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]]) -; DAZ-NEXT: [[TMP8:%.*]] = fdiv contract float 4.000000e+00, [[TMP3]], !fpmath !2 -; DAZ-NEXT: [[TMP9:%.*]] = fdiv contract float undef, [[TMP4]], !fpmath !2 -; DAZ-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; DAZ-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 1 -; DAZ-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 2 -; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP9]], i64 3 +; DAZ-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; DAZ-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 +; DAZ-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP8]], 1 +; DAZ-NEXT: [[TMP11:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; DAZ-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 4.000000e+00) +; DAZ-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; DAZ-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 +; DAZ-NEXT: [[TMP15:%.*]] = fmul contract float [[TMP13]], [[TMP11]] +; DAZ-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP10]] +; DAZ-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP16]]) +; DAZ-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; DAZ-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; DAZ-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1 +; DAZ-NEXT: [[TMP21:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; DAZ-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; DAZ-NEXT: 
[[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 +; DAZ-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1 +; DAZ-NEXT: [[TMP25:%.*]] = fmul contract float [[TMP23]], [[TMP21]] +; DAZ-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP20]] +; DAZ-NEXT: [[TMP27:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP25]], i32 [[TMP26]]) +; DAZ-NEXT: [[TMP28:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; DAZ-NEXT: [[TMP29:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP7]], i64 1 +; DAZ-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP17]], i64 2 +; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP27]], i64 3 ; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %denom = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> %arg) @@ -3035,23 +3691,113 @@ ; Make sure we don't crash if a vector square root has a constant vecctor input define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float> %x) { -; CHECK-LABEL: define <4 x float> @rsq_f32_vector_const_denom -; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[SQRT:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), !fpmath !2 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[SQRT]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 -; CHECK-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) -; CHECK-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) -; CHECK-NEXT: [[TMP7:%.*]] = fneg contract float [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fdiv contract float undef, [[TMP3]], !fpmath !2 -; CHECK-NEXT: [[TMP9:%.*]] = fdiv contract float 2.000000e+00, [[TMP4]], !fpmath !2 -; CHECK-NEXT: [[TMP10:%.*]] = 
insertelement <4 x float> poison, float [[TMP5]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP8]], i64 2 -; CHECK-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP9]], i64 3 -; CHECK-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; IEEE-GOODFREXP-LABEL: define <4 x float> @rsq_f32_vector_const_denom +; IEEE-GOODFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { +; IEEE-GOODFREXP-NEXT: [[SQRT:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), !fpmath !2 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[SQRT]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = fneg contract float [[TMP6]] +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP8]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = fmul contract float [[TMP13]], [[TMP11]] +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], 
[[TMP10]] +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP16]]) +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 2.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = fmul contract float [[TMP23]], [[TMP21]] +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP20]] +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP25]], i32 [[TMP26]]) +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP7]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP17]], i64 2 +; IEEE-GOODFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP27]], i64 3 +; IEEE-GOODFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; +; IEEE-BADFREXP-LABEL: define <4 x float> @rsq_f32_vector_const_denom +; IEEE-BADFREXP-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { +; IEEE-BADFREXP-NEXT: [[SQRT:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), !fpmath !2 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[SQRT]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 +; IEEE-BADFREXP-NEXT: 
[[TMP4:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = fneg contract float [[TMP6]] +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = fmul contract float [[TMP13]], [[TMP11]] +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP10]] +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP16]]) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP4]]) +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 2.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float 2.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = fmul contract float [[TMP23]], [[TMP21]] +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], 
[[TMP20]] +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP25]], i32 [[TMP26]]) +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP7]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP17]], i64 2 +; IEEE-BADFREXP-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP27]], i64 3 +; IEEE-BADFREXP-NEXT: ret <4 x float> [[PARTIAL_RSQ]] +; +; DAZ-LABEL: define <4 x float> @rsq_f32_vector_const_denom +; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { +; DAZ-NEXT: [[SQRT:%.*]] = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), !fpmath !2 +; DAZ-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[SQRT]], i64 0 +; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[SQRT]], i64 1 +; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SQRT]], i64 2 +; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SQRT]], i64 3 +; DAZ-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00) +; DAZ-NEXT: [[TMP6:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00) +; DAZ-NEXT: [[TMP7:%.*]] = fneg contract float [[TMP6]] +; DAZ-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]]) +; DAZ-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0 +; DAZ-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP8]], 1 +; DAZ-NEXT: [[TMP11:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP9]]) +; DAZ-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; DAZ-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0 +; DAZ-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1 +; DAZ-NEXT: [[TMP15:%.*]] = fmul contract float [[TMP13]], [[TMP11]] +; DAZ-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP10]] +; 
DAZ-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP15]], i32 [[TMP16]]) +; DAZ-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP4]]) +; DAZ-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0 +; DAZ-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1 +; DAZ-NEXT: [[TMP21:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP19]]) +; DAZ-NEXT: [[TMP22:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 2.000000e+00) +; DAZ-NEXT: [[TMP23:%.*]] = extractvalue { float, i32 } [[TMP22]], 0 +; DAZ-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP22]], 1 +; DAZ-NEXT: [[TMP25:%.*]] = fmul contract float [[TMP23]], [[TMP21]] +; DAZ-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[TMP20]] +; DAZ-NEXT: [[TMP27:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP25]], i32 [[TMP26]]) +; DAZ-NEXT: [[TMP28:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; DAZ-NEXT: [[TMP29:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP7]], i64 1 +; DAZ-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP17]], i64 2 +; DAZ-NEXT: [[PARTIAL_RSQ:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP27]], i64 3 +; DAZ-NEXT: ret <4 x float> [[PARTIAL_RSQ]] ; %sqrt = call contract <4 x float> @llvm.sqrt.v4f32(<4 x float> ), !fpmath !2 %partial.rsq = fdiv contract <4 x float> , %sqrt, !fpmath !2 @@ -3073,10 +3819,30 @@ ; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] ; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP8]]) ; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) -; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i64 0 -; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP12]], i64 1 -; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float 0x7FF8000000000000, i64 2 -; 
IEEE-GOODFREXP-NEXT: [[CONST_PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP15]], float 0x3FC99999A0000000, i64 3 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 3.200000e+01) +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP13]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP17]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP17]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = fmul float [[TMP18]], [[TMP16]] +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = sub i32 [[TMP19]], [[TMP15]] +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP20]], i32 [[TMP21]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 1.000000e+01) +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP24]]) +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 2.000000e+00) +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP27]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP27]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = fmul float [[TMP28]], [[TMP26]] +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP25]] +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP30]], i32 [[TMP31]]) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i64 0 +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], 
float [[TMP12]], i64 1 +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP22]], i64 2 +; IEEE-GOODFREXP-NEXT: [[CONST_PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP35]], float [[TMP32]], i64 3 ; IEEE-GOODFREXP-NEXT: ret <4 x float> [[CONST_PARTIAL_RCP]] ; ; IEEE-BADFREXP-LABEL: define <4 x float> @fdiv_constant_f32_vector @@ -3093,20 +3859,60 @@ ; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] ; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP8]]) ; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) -; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i64 0 -; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP12]], i64 1 -; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float 0x7FF8000000000000, i64 2 -; IEEE-BADFREXP-NEXT: [[CONST_PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP15]], float 0x3FC99999A0000000, i64 3 +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 3.200000e+01) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float 3.200000e+01) +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP17]], 0 +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float undef) +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = fmul float [[TMP18]], [[TMP16]] +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = sub i32 [[TMP19]], [[TMP15]] +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP20]], i32 [[TMP21]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call { float, i32 } 
@llvm.frexp.f32.i32(float 1.000000e+01) +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float 1.000000e+01) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP24]]) +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 2.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP27]], 0 +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float 2.000000e+00) +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = fmul float [[TMP28]], [[TMP26]] +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP25]] +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP30]], i32 [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i64 0 +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP12]], i64 1 +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = insertelement <4 x float> [[TMP34]], float [[TMP22]], i64 2 +; IEEE-BADFREXP-NEXT: [[CONST_PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP35]], float [[TMP32]], i64 3 ; IEEE-BADFREXP-NEXT: ret <4 x float> [[CONST_PARTIAL_RCP]] ; ; DAZ-LABEL: define <4 x float> @fdiv_constant_f32_vector ; DAZ-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] { ; DAZ-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.rcp.f32(float 5.000000e-01) ; DAZ-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.rcp.f32(float -2.000000e+00) -; DAZ-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 -; DAZ-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP2]], i64 1 -; DAZ-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float 0x7FF8000000000000, i64 2 -; DAZ-NEXT: [[CONST_PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP5]], float 0x3FC99999A0000000, i64 3 +; DAZ-NEXT: [[TMP3:%.*]] = call { 
float, i32 } @llvm.frexp.f32.i32(float 3.200000e+01) +; DAZ-NEXT: [[TMP4:%.*]] = extractvalue { float, i32 } [[TMP3]], 0 +; DAZ-NEXT: [[TMP5:%.*]] = extractvalue { float, i32 } [[TMP3]], 1 +; DAZ-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP4]]) +; DAZ-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float undef) +; DAZ-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 +; DAZ-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP7]], 1 +; DAZ-NEXT: [[TMP10:%.*]] = fmul float [[TMP8]], [[TMP6]] +; DAZ-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP5]] +; DAZ-NEXT: [[TMP12:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP10]], i32 [[TMP11]]) +; DAZ-NEXT: [[TMP13:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 1.000000e+01) +; DAZ-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP13]], 0 +; DAZ-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP13]], 1 +; DAZ-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP14]]) +; DAZ-NEXT: [[TMP17:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 2.000000e+00) +; DAZ-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP17]], 0 +; DAZ-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP17]], 1 +; DAZ-NEXT: [[TMP20:%.*]] = fmul float [[TMP18]], [[TMP16]] +; DAZ-NEXT: [[TMP21:%.*]] = sub i32 [[TMP19]], [[TMP15]] +; DAZ-NEXT: [[TMP22:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP20]], i32 [[TMP21]]) +; DAZ-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 +; DAZ-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP2]], i64 1 +; DAZ-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP12]], i64 2 +; DAZ-NEXT: [[CONST_PARTIAL_RCP:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP22]], i64 3 ; DAZ-NEXT: ret <4 x float> [[CONST_PARTIAL_RCP]] ; %const.partial.rcp = fdiv <4 x float> , , !fpmath !2 @@ -3120,11 +3926,38 @@ ; IEEE-GOODFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) 
[[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 ; IEEE-GOODFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = fmul float [[TMP6]], [[TMP4]] +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[MD_1ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP8]], i32 [[TMP9]]) ; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP10]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP14]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP14]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = fmul float [[TMP15]], [[TMP13]] +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], 
[[TMP12]] +; IEEE-GOODFREXP-NEXT: [[MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP18]]) ; IEEE-GOODFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[MD_3ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !3 +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = fmul float [[TMP24]], [[TMP22]] +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = sub i32 [[TMP25]], [[TMP21]] +; IEEE-GOODFREXP-NEXT: [[MD_3ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP27]]) ; IEEE-GOODFREXP-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[FAST_MD_25ULP:%.*]] = fdiv fast float [[A]], [[B]], !fpmath !0 ; IEEE-GOODFREXP-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 @@ -3132,21 +3965,21 @@ ; IEEE-GOODFREXP-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] ; IEEE-GOODFREXP-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) -; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, 
[[TMP3]] -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) -; IEEE-GOODFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP6]] +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP28]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = sub i32 0, [[TMP30]] +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]]) +; IEEE-GOODFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP33]] ; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP7]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) -; IEEE-GOODFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP12]] +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = extractvalue { float, i32 } [[TMP34]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP34]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP36]] +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP35]]) +; 
IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP38]], i32 [[TMP37]]) +; IEEE-GOODFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP39]] ; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: ret void ; @@ -3156,11 +3989,38 @@ ; IEEE-BADFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 ; IEEE-BADFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = fmul float [[TMP6]], [[TMP4]] +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP3]] +; IEEE-BADFREXP-NEXT: [[MD_1ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP8]], i32 [[TMP9]]) ; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call 
float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP14]], 0 +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = fmul float [[TMP15]], [[TMP13]] +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], [[TMP12]] +; IEEE-BADFREXP-NEXT: [[MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP18]]) ; IEEE-BADFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[MD_3ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !3 +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = fmul float [[TMP24]], [[TMP22]] +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = sub i32 [[TMP25]], [[TMP21]] +; IEEE-BADFREXP-NEXT: [[MD_3ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP27]]) ; IEEE-BADFREXP-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[FAST_MD_25ULP:%.*]] = fdiv fast float [[A]], [[B]], !fpmath !0 ; IEEE-BADFREXP-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 @@ -3168,21 +4028,21 @@ ; IEEE-BADFREXP-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: 
[[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] ; IEEE-BADFREXP-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP6]] +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = sub i32 0, [[TMP30]] +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP33]] ; IEEE-BADFREXP-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) -; IEEE-BADFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = 
fmul arcp float [[A]], [[TMP12]] +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = extractvalue { float, i32 } [[TMP34]], 0 +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP36]] +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP35]]) +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP38]], i32 [[TMP37]]) +; IEEE-BADFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP39]] ; IEEE-BADFREXP-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: ret void ; @@ -3192,7 +4052,16 @@ ; DAZ-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 ; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 +; DAZ-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; DAZ-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; DAZ-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; DAZ-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; DAZ-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; DAZ-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; DAZ-NEXT: [[TMP8:%.*]] = fmul float [[TMP6]], [[TMP4]] +; DAZ-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP3]] +; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP8]], i32 [[TMP9]]) ; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[MD_25ULP:%.*]] = call float @llvm.amdgcn.fdiv.fast(float [[A]], float [[B]]) ; DAZ-NEXT: store volatile 
float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 @@ -3204,11 +4073,11 @@ ; DAZ-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] ; DAZ-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[TMP1:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) -; DAZ-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP1]] +; DAZ-NEXT: [[TMP10:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) +; DAZ-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP10]] ; DAZ-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[TMP2:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) -; DAZ-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP2]] +; DAZ-NEXT: [[TMP11:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) +; DAZ-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP11]] ; DAZ-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; @@ -3242,11 +4111,38 @@ ; IEEE-GOODFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 ; IEEE-GOODFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 +; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-GOODFREXP-NEXT: 
[[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = fmul float [[TMP6]], [[TMP4]] +; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP3]] +; IEEE-GOODFREXP-NEXT: [[MD_1ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP8]], i32 [[TMP9]]) ; IEEE-GOODFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 +; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = extractvalue { float, i32 } [[TMP10]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP14]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP14]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = fmul float [[TMP15]], [[TMP13]] +; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], [[TMP12]] +; IEEE-GOODFREXP-NEXT: [[MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP18]]) ; IEEE-GOODFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[MD_3ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !3 +; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; IEEE-GOODFREXP-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 +; 
IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = fmul float [[TMP24]], [[TMP22]] +; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = sub i32 [[TMP25]], [[TMP21]] +; IEEE-GOODFREXP-NEXT: [[MD_3ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP27]]) ; IEEE-GOODFREXP-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[FAST_MD_25ULP:%.*]] = fdiv fast float [[A]], [[B]], !fpmath !0 ; IEEE-GOODFREXP-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 @@ -3254,21 +4150,21 @@ ; IEEE-GOODFREXP-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] ; IEEE-GOODFREXP-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) -; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] -; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) -; IEEE-GOODFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP6]] +; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP28]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = sub i32 0, [[TMP30]] +; IEEE-GOODFREXP-NEXT: [[TMP32:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-GOODFREXP-NEXT: [[TMP33:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]]) +; 
IEEE-GOODFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP33]] ; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) -; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 -; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP7]], 1 -; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] -; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) -; IEEE-GOODFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP12]] +; IEEE-GOODFREXP-NEXT: [[TMP34:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-GOODFREXP-NEXT: [[TMP35:%.*]] = extractvalue { float, i32 } [[TMP34]], 0 +; IEEE-GOODFREXP-NEXT: [[TMP36:%.*]] = extractvalue { float, i32 } [[TMP34]], 1 +; IEEE-GOODFREXP-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP36]] +; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP35]]) +; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP38]], i32 [[TMP37]]) +; IEEE-GOODFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP39]] ; IEEE-GOODFREXP-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: ret void ; @@ -3278,11 +4174,38 @@ ; IEEE-BADFREXP-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 ; IEEE-BADFREXP-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 +; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue 
{ float, i32 } [[TMP1]], 0 +; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = fmul float [[TMP6]], [[TMP4]] +; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP3]] +; IEEE-BADFREXP-NEXT: [[MD_1ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP8]], i32 [[TMP9]]) ; IEEE-BADFREXP-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[MD_25ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !0 +; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractvalue { float, i32 } [[TMP10]], 0 +; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP11]]) +; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = extractvalue { float, i32 } [[TMP14]], 0 +; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = fmul float [[TMP15]], [[TMP13]] +; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = sub i32 [[TMP16]], [[TMP12]] +; IEEE-BADFREXP-NEXT: [[MD_25ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP17]], i32 [[TMP18]]) ; IEEE-BADFREXP-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[MD_3ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !3 +; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; 
IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0 +; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP20]]) +; IEEE-BADFREXP-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0 +; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[A]]) +; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = fmul float [[TMP24]], [[TMP22]] +; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = sub i32 [[TMP25]], [[TMP21]] +; IEEE-BADFREXP-NEXT: [[MD_3ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP26]], i32 [[TMP27]]) ; IEEE-BADFREXP-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[FAST_MD_25ULP:%.*]] = fdiv fast float [[A]], [[B]], !fpmath !0 ; IEEE-BADFREXP-NEXT: store volatile float [[FAST_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 @@ -3290,21 +4213,21 @@ ; IEEE-BADFREXP-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] ; IEEE-BADFREXP-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) -; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 -; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) -; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP3]] -; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP2]]) -; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP5]], i32 [[TMP4]]) -; IEEE-BADFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP6]] +; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float 
[[B]]) +; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = extractvalue { float, i32 } [[TMP28]], 0 +; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = sub i32 0, [[TMP30]] +; IEEE-BADFREXP-NEXT: [[TMP32:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP29]]) +; IEEE-BADFREXP-NEXT: [[TMP33:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP32]], i32 [[TMP31]]) +; IEEE-BADFREXP-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP33]] ; IEEE-BADFREXP-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) -; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractvalue { float, i32 } [[TMP7]], 0 -; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) -; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = sub i32 0, [[TMP9]] -; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP8]]) -; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP11]], i32 [[TMP10]]) -; IEEE-BADFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP12]] +; IEEE-BADFREXP-NEXT: [[TMP34:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP35:%.*]] = extractvalue { float, i32 } [[TMP34]], 0 +; IEEE-BADFREXP-NEXT: [[TMP36:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[B]]) +; IEEE-BADFREXP-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP36]] +; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[TMP35]]) +; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = call arcp float @llvm.ldexp.f32.i32(float [[TMP38]], i32 [[TMP37]]) +; IEEE-BADFREXP-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP39]] ; IEEE-BADFREXP-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: ret void ; @@ -3314,7 +4237,16 @@ ; DAZ-NEXT: store volatile float 
[[NO_MD]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[MD_HALF_ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !1 ; DAZ-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[MD_1ULP:%.*]] = fdiv float [[A]], [[B]], !fpmath !2 +; DAZ-NEXT: [[TMP1:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[B]]) +; DAZ-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP1]], 0 +; DAZ-NEXT: [[TMP3:%.*]] = extractvalue { float, i32 } [[TMP1]], 1 +; DAZ-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[TMP2]]) +; DAZ-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[A]]) +; DAZ-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0 +; DAZ-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1 +; DAZ-NEXT: [[TMP8:%.*]] = fmul float [[TMP6]], [[TMP4]] +; DAZ-NEXT: [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP3]] +; DAZ-NEXT: [[MD_1ULP:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP8]], i32 [[TMP9]]) ; DAZ-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[MD_25ULP:%.*]] = call float @llvm.amdgcn.fdiv.fast(float [[A]], float [[B]]) ; DAZ-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 @@ -3326,11 +4258,11 @@ ; DAZ-NEXT: store volatile float [[AFN_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[NO_MD_ARCP:%.*]] = fdiv arcp float [[A]], [[B]] ; DAZ-NEXT: store volatile float [[NO_MD_ARCP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[TMP1:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) -; DAZ-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP1]] +; DAZ-NEXT: [[TMP10:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) +; DAZ-NEXT: [[ARCP_MD_25ULP:%.*]] = fmul arcp float [[A]], [[TMP10]] ; DAZ-NEXT: store volatile float [[ARCP_MD_25ULP]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[TMP2:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) -; DAZ-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float 
[[A]], [[TMP2]] +; DAZ-NEXT: [[TMP11:%.*]] = call arcp float @llvm.amdgcn.rcp.f32(float [[B]]) +; DAZ-NEXT: [[ARCP_MD_1ULP:%.*]] = fmul arcp float [[A]], [[TMP11]] ; DAZ-NEXT: store volatile float [[ARCP_MD_1ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: ret void ; Index: llvm/test/CodeGen/AMDGPU/fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv.ll +++ llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -418,53 +418,31 @@ ; Use correct fdiv define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a, float %b) #1 { -; GFX6-FASTFMA-LABEL: s_fdiv_25ulp_ieee_f32: -; GFX6-FASTFMA: ; %bb.0: ; %entry -; GFX6-FASTFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX6-FASTFMA-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-FASTFMA-NEXT: s_mov_b32 s6, -1 -; GFX6-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FASTFMA-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, s2, v3, s2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FASTFMA-NEXT: s_mov_b32 s4, s0 -; GFX6-FASTFMA-NEXT: s_mov_b32 s5, s1 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v1, s3, v0 -; GFX6-FASTFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX6-FASTFMA-NEXT: s_endpgm -; -; GFX6-SLOWFMA-LABEL: s_fdiv_25ulp_ieee_f32: -; GFX6-SLOWFMA: ; %bb.0: ; %entry -; GFX6-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX6-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, -1 -; GFX6-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-SLOWFMA-NEXT: 
v_div_scale_f32 v1, s[4:5], s3, s3, v0 -; GFX6-SLOWFMA-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s2, v2, s2 -; GFX6-SLOWFMA-NEXT: s_mov_b32 s4, s0 -; GFX6-SLOWFMA-NEXT: s_mov_b32 s5, s1 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v4, v2, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v4, v2 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v4, v2 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v1, s3, v0 -; GFX6-SLOWFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX6-SLOWFMA-NEXT: s_endpgm +; GFX6-LABEL: s_fdiv_25ulp_ieee_f32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: v_mov_b32_e32 v0, 0x7f800000 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_frexp_mant_f32_e32 v1, s3 +; GFX6-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |s3|, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: v_rcp_f32_e32 v1, v1 +; GFX6-NEXT: v_frexp_mant_f32_e32 v3, s2 +; GFX6-NEXT: v_mov_b32_e32 v4, s2 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |s2|, v0 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v2, s3 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v3, s2 +; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v3, v2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX7: ; %bb.0: ; %entry @@ -472,21 +450,16 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 
-; GFX7-NEXT: v_rcp_f32_e32 v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_div_scale_f32 v3, vcc, s2, v3, s2 -; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, s3 +; GFX7-NEXT: v_rcp_f32_e32 v0, v0 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v2, s2 +; GFX7-NEXT: v_frexp_mant_f32_e32 v3, s2 +; GFX7-NEXT: v_mul_f32_e32 v0, v3, v0 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v2, v1 ; GFX7-NEXT: s_mov_b32 s4, s0 ; GFX7-NEXT: s_mov_b32 s5, s1 -; GFX7-NEXT: v_div_fixup_f32 v0, v1, s3, v0 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -494,19 +467,14 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, s2, v2, s2 -; GFX8-NEXT: v_rcp_f32_e32 v3, v1 -; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3 -; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2 -; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2 -; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; GFX8-NEXT: v_div_fixup_f32 v2, v1, s3, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v1, s3 +; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v0, s3 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v2, s2 +; GFX8-NEXT: v_frexp_mant_f32_e32 v3, s2 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX8-NEXT: v_ldexp_f32 v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: 
flat_store_dword v[0:1], v2 @@ -516,39 +484,33 @@ ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 -; GFX10-NEXT: v_rcp_f32_e32 v1, v0 -; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 -; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 -; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 -; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 -; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s3 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s2 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s2 +; GFX10-NEXT: v_rcp_f32_e32 v0, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, v3, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 -; GFX11-NEXT: v_rcp_f32_e32 v1, v0 +; GFX11-NEXT: v_frexp_mant_f32_e32 v0, s3 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 +; GFX11-NEXT: v_frexp_mant_f32_e32 v2, s2 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v3, s2 +; GFX11-NEXT: v_rcp_f32_e32 v0, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v2, -v0, v1, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 -; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1 -; GFX11-NEXT: v_fma_f32 v4, -v0, v3, v2 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1 -; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2 -; 
GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2673,101 +2635,75 @@ } define float @v_fdiv_f32_ieee_25ulp(float %x, float %y) #1 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_ieee_25ulp: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_ieee_25ulp: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: 
v_fdiv_f32_ieee_25ulp: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_f32_ieee_25ulp: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX7-NEXT: v_rcp_f32_e32 v2, v2 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f32_ieee_25ulp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 -; 
GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f32_ieee_25ulp: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f32_ieee_25ulp: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: 
s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_fdiv_f32_ieee_25ulp: @@ -2897,7 +2833,87 @@ } define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #2 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic_25ulp: +; GFX6-LABEL: v_fdiv_f32_dynamic_25ulp: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX7-NEXT: v_rcp_f32_e32 v2, v2 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fdiv_f32_dynamic_25ulp: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v2, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; EG-LABEL: v_fdiv_f32_dynamic_25ulp: +; EG: ; %bb.0: +; EG-NEXT: CF_END +; EG-NEXT: PAD + %div = fdiv float %x, %y, !fpmath !0 + ret float %div +} + +define float @v_fdiv_f32_daz(float %x, float %y) #0 { +; GFX6-FASTFMA-LABEL: v_fdiv_f32_daz: ; GFX6-FASTFMA: ; %bb.0: ; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 @@ -2915,7 +2931,7 @@ ; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_dynamic_25ulp: +; GFX6-SLOWFMA-LABEL: v_fdiv_f32_daz: ; 
GFX6-SLOWFMA: ; %bb.0: ; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 @@ -2933,7 +2949,7 @@ ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp: +; GFX7-LABEL: v_fdiv_f32_daz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 @@ -2951,7 +2967,7 @@ ; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fdiv_f32_dynamic_25ulp: +; GFX8-LABEL: v_fdiv_f32_daz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 @@ -2969,7 +2985,7 @@ ; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp: +; GFX10-LABEL: v_fdiv_f32_daz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 @@ -2987,7 +3003,7 @@ ; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp: +; GFX11-LABEL: v_fdiv_f32_daz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 @@ -3006,145 +3022,27 @@ ; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; EG-LABEL: v_fdiv_f32_dynamic_25ulp: +; EG-LABEL: v_fdiv_f32_daz: ; EG: ; %bb.0: ; EG-NEXT: CF_END ; EG-NEXT: PAD - %div = fdiv float %x, %y, !fpmath !0 + %div = fdiv float %x, %y ret float %div } -define float @v_fdiv_f32_daz(float %x, float %y) #0 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_daz: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 
-; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_daz: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: v_fdiv_f32_daz: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX7-NEXT: 
s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: v_fdiv_f32_daz: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: v_fdiv_f32_daz: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_fdiv_f32_daz: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: s_denorm_mode 15 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v3 -; 
GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; EG-LABEL: v_fdiv_f32_daz: -; EG: ; %bb.0: -; EG-NEXT: CF_END -; EG-NEXT: PAD - %div = fdiv float %x, %y - ret float %div -} - -define float @v_fdiv_f32_daz_25ulp(float %x, float %y) #0 { -; GFX678-LABEL: v_fdiv_f32_daz_25ulp: -; GFX678: ; %bb.0: -; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX678-NEXT: s_mov_b32 s4, 0x6f800000 -; GFX678-NEXT: v_mov_b32_e32 v2, 0x2f800000 -; GFX678-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 -; GFX678-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc -; GFX678-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX678-NEXT: v_rcp_f32_e32 v1, v1 -; GFX678-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX678-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX678-NEXT: s_setpc_b64 s[30:31] +define float @v_fdiv_f32_daz_25ulp(float %x, float %y) #0 { +; GFX678-LABEL: v_fdiv_f32_daz_25ulp: +; GFX678: ; %bb.0: +; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX678-NEXT: s_mov_b32 s4, 0x6f800000 +; GFX678-NEXT: v_mov_b32_e32 v2, 0x2f800000 +; GFX678-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4 +; GFX678-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc +; GFX678-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX678-NEXT: v_rcp_f32_e32 v1, v1 +; GFX678-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX678-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX678-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f32_daz_25ulp: ; GFX10: ; %bb.0: @@ -3292,106 +3190,79 @@ } define float @v_fdiv_f32_ieee_25ulp_contractable_user(float %x, float %y, float %z) #1 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_ieee_25ulp_contractable_user: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 -; 
GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX6-FASTFMA-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_ieee_25ulp_contractable_user: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v5, v5 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v7, -v3, v6, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, v7, v5, v6 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, -v3, v6, v4 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX6-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_f32_ieee_25ulp_contractable_user: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX6-NEXT: v_rcp_f32_e32 v3, v3 +; GFX6-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: 
v_ldexp_f32_e32 v0, v3, v0 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_f32_ieee_25ulp_contractable_user: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v4, v3 -; GFX7-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX7-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX7-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 -; GFX7-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX7-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX7-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX7-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX7-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX7-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX7-NEXT: v_rcp_f32_e32 v3, v3 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v4, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f32_ieee_25ulp_contractable_user: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX8-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; GFX8-NEXT: v_fma_f32 v5, v6, v5, v5 -; GFX8-NEXT: v_mul_f32_e32 v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v7, -v3, v6, v4 -; GFX8-NEXT: v_fma_f32 v6, v7, v5, v6 -; GFX8-NEXT: v_fma_f32 v3, -v3, v6, v4 -; GFX8-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; GFX8-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX8-NEXT: v_rcp_f32_e32 v3, v3 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v4, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v4, v1 +; GFX8-NEXT: 
v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f32_ieee_25ulp_contractable_user: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v3, s4, v1, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v4, v3 -; GFX10-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v4 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX10-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v4 -; GFX10-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX10-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX10-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_ldexp_f32 v0, v3, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f32_ieee_25ulp_contractable_user: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v3, null, v1, v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v4, v3 +; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX11-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX11-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v4 -; GFX11-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX11-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX11-NEXT: v_div_fixup_f32 
v0, v3, v1, v0 +; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX11-NEXT: v_ldexp_f32 v0, v3, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3530,118 +3401,79 @@ } define float @v_fdiv_f32_dynamic_25ulp_contractable_user(float %x, float %y, float %z) #2 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic_25ulp_contractable_user: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX6-FASTFMA-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_dynamic_25ulp_contractable_user: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v3 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v5, v5 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v7, -v3, v6, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, v7, v5, v6 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, -v3, v6, v4 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: 
v_div_fmas_f32 v3, v3, v5, v6 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX6-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_f32_dynamic_25ulp_contractable_user: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX6-NEXT: v_rcp_f32_e32 v3, v3 +; GFX6-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v3, v0 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp_contractable_user: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v4, v3 -; GFX7-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX7-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX7-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX7-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX7-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX7-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX7-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX7-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX7-NEXT: v_rcp_f32_e32 v3, v3 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v4, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: 
v_add_f32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f32_dynamic_25ulp_contractable_user: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX8-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX8-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; GFX8-NEXT: v_fma_f32 v5, v6, v5, v5 -; GFX8-NEXT: v_mul_f32_e32 v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v7, -v3, v6, v4 -; GFX8-NEXT: v_fma_f32 v6, v7, v5, v6 -; GFX8-NEXT: v_fma_f32 v3, -v3, v6, v4 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; GFX8-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX8-NEXT: v_rcp_f32_e32 v3, v3 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v4, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v4, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp_contractable_user: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v3, s4, v1, v1, v0 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v4, v3 -; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX10-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v4 -; GFX10-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX10-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX10-NEXT: 
v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_ldexp_f32 v0, v3, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp_contractable_user: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v3, null, v1, v1, v0 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v4, v3 -; GFX11-NEXT: s_denorm_mode 15 +; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX11-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX11-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v4 -; GFX11-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX11-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX11-NEXT: v_ldexp_f32 v0, v3, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -3932,231 +3764,87 @@ } define float @v_fdiv_f32_ieee_25ulp__nnan_ninf(float %x, float %y, float %z) #1 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; 
GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX7-NEXT: 
v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX7-NEXT: v_rcp_f32_e32 v2, v2 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-NEXT: 
v_fma_f32 v6, -v2, v5, v4 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; EG-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf: -; EG: ; %bb.0: -; EG-NEXT: CF_END -; EG-NEXT: PAD - %div = fdiv nnan ninf float %x, %y, !fpmath !0 - ret float %div -} - -define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic__nnan_ninf: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FASTFMA-NEXT: 
v_mul_f32_e32 v5, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_dynamic__nnan_ninf: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: v_fdiv_f32_dynamic__nnan_ninf: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX7-NEXT: s_setpc_b64 
s[30:31] -; -; GFX8-LABEL: v_fdiv_f32_dynamic__nnan_ninf: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: v_fdiv_f32_dynamic__nnan_ninf: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_fdiv_f32_dynamic__nnan_ninf: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: s_denorm_mode 15 +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, 
v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; EG-LABEL: v_fdiv_f32_dynamic__nnan_ninf: +; EG-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf: ; EG: ; %bb.0: ; EG-NEXT: CF_END ; EG-NEXT: PAD - %div = fdiv nnan ninf float %x, %y + %div = fdiv nnan ninf float %x, %y, !fpmath !0 ret float %div } -define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z) #2 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf: +define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 { +; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic__nnan_ninf: ; GFX6-FASTFMA: ; %bb.0: ; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 @@ -4174,7 +3862,7 @@ ; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] ; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf: +; GFX6-SLOWFMA-LABEL: v_fdiv_f32_dynamic__nnan_ninf: ; GFX6-SLOWFMA: ; %bb.0: ; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 @@ -4192,7 +3880,7 @@ ; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf: +; GFX7-LABEL: v_fdiv_f32_dynamic__nnan_ninf: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 @@ -4210,7 
+3898,7 @@ ; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf: +; GFX8-LABEL: v_fdiv_f32_dynamic__nnan_ninf: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 @@ -4228,7 +3916,7 @@ ; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf: +; GFX10-LABEL: v_fdiv_f32_dynamic__nnan_ninf: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 @@ -4246,7 +3934,7 @@ ; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf: +; GFX11-LABEL: v_fdiv_f32_dynamic__nnan_ninf: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 @@ -4265,6 +3953,86 @@ ; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; +; EG-LABEL: v_fdiv_f32_dynamic__nnan_ninf: +; EG: ; %bb.0: +; EG-NEXT: CF_END +; EG-NEXT: PAD + %div = fdiv nnan ninf float %x, %y + ret float %div +} + +define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z) #2 { +; GFX6-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; 
GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX7-NEXT: v_rcp_f32_e32 v2, v2 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v2, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: 
v_ldexp_f32 v0, v2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; EG-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf: ; EG: ; %bb.0: ; EG-NEXT: CF_END @@ -4550,106 +4318,79 @@ } define float @v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user(float %x, float %y, float %z) #1 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX6-FASTFMA-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v5, v5 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v7, -v3, v6, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, v7, v5, v6 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, -v3, v6, v4 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX6-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX6-NEXT: v_rcp_f32_e32 v3, v3 +; GFX6-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v3, v0 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v4, v3 -; GFX7-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX7-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX7-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 -; GFX7-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX7-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX7-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX7-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX7-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX7-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX7-NEXT: v_rcp_f32_e32 v3, v3 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v4, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX8-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; 
GFX8-NEXT: v_fma_f32 v5, v6, v5, v5 -; GFX8-NEXT: v_mul_f32_e32 v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v7, -v3, v6, v4 -; GFX8-NEXT: v_fma_f32 v6, v7, v5, v6 -; GFX8-NEXT: v_fma_f32 v3, -v3, v6, v4 -; GFX8-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; GFX8-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX8-NEXT: v_rcp_f32_e32 v3, v3 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v4, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v4, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v3, s4, v1, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v4, v3 -; GFX10-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v4 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX10-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v4 -; GFX10-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX10-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX10-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_ldexp_f32 v0, v3, v0 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f32_ieee_25ulp__nnan_ninf_contractable_user: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v3, null, v1, v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v4, v3 +; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX11-NEXT: 
v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX11-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX11-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v4 -; GFX11-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX11-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX11-NEXT: v_div_fixup_f32 v0, v3, v1, v0 +; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX11-NEXT: v_ldexp_f32 v0, v3, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4778,131 +4519,92 @@ ; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; EG-LABEL: v_fdiv_f32_dynamic__nnan_ninf_contractable_user: -; EG: ; %bb.0: -; EG-NEXT: CF_END -; EG-NEXT: PAD - %div = fdiv nnan ninf contract float %x, %y - %add = fadd contract float %div, %z - ret float %add -} - -define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, float %y, float %z) #2 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v3, 
v3, v4, v6 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX6-FASTFMA-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v5, v3 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v5, v5 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v7, -v3, v6, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, v7, v5, v6 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, -v3, v6, v4 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX6-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v4, v3 -; GFX7-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX7-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX7-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX7-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX7-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX7-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX7-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX7-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user: -; GFX8: ; 
%bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0 -; GFX8-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v5, v3 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX8-NEXT: v_fma_f32 v6, -v3, v5, 1.0 -; GFX8-NEXT: v_fma_f32 v5, v6, v5, v5 -; GFX8-NEXT: v_mul_f32_e32 v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v7, -v3, v6, v4 -; GFX8-NEXT: v_fma_f32 v6, v7, v5, v6 -; GFX8-NEXT: v_fma_f32 v3, -v3, v6, v4 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_fmas_f32 v3, v3, v5, v6 -; GFX8-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v3, s4, v1, v1, v0 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v4, v3 -; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX10-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX10-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v4 -; GFX10-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX10-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v3, null, v1, v1, v0 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v4, v3 -; GFX11-NEXT: s_denorm_mode 15 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX11-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX11-NEXT: v_fma_f32 v7, 
-v3, v6, v5 -; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v4 -; GFX11-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX11-NEXT: v_div_fixup_f32 v0, v3, v1, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; +; EG-LABEL: v_fdiv_f32_dynamic__nnan_ninf_contractable_user: +; EG: ; %bb.0: +; EG-NEXT: CF_END +; EG-NEXT: PAD + %div = fdiv nnan ninf contract float %x, %y + %add = fadd contract float %div, %z + ret float %add +} + +define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, float %y, float %z) #2 { +; GFX6-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX6-NEXT: v_rcp_f32_e32 v3, v3 +; GFX6-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v3, v0 +; GFX6-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX7-NEXT: v_rcp_f32_e32 v3, v3 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v4, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX8-NEXT: v_rcp_f32_e32 v3, v3 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v4, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v4, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_ldexp_f32 v0, v3, v0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v4, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX11-NEXT: v_ldexp_f32 v0, v3, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; EG-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user: ; EG: ; %bb.0: ; EG-NEXT: CF_END @@ -5191,101 +4893,75 @@ } define float @v_fdiv_neglhs_f32_ieee_25ulp(float %x, float %y) #1 { -; GFX6-FASTFMA-LABEL: v_fdiv_neglhs_f32_ieee_25ulp: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, -v0, v1, -v0 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_neglhs_f32_ieee_25ulp: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, -v0, v1, -v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_neglhs_f32_ieee_25ulp: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_frexp_mant_f32_e64 v3, -v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v3, -v0, v3, s[4:5] +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 
v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_neglhs_f32_ieee_25ulp: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0 -; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX7-NEXT: v_div_scale_f32 v4, vcc, -v0, v1, -v0 -; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX7-NEXT: v_rcp_f32_e32 v2, v2 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX7-NEXT: v_frexp_mant_f32_e64 v0, -v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_neglhs_f32_ieee_25ulp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, -v0, v1, -v0 -; GFX8-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX8-NEXT: v_frexp_mant_f32_e64 v0, -v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: v_fdiv_neglhs_f32_ieee_25ulp: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, -v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, -v0, v1, -v0 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e64 v3, -v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_neglhs_f32_ieee_25ulp: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, -v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e64 v3, -v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, -v0, v1, -v0 -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: 
v_fdiv_neglhs_f32_ieee_25ulp: @@ -5417,113 +5093,75 @@ } define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #2 { -; GFX6-FASTFMA-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, -v0, v1, -v0 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, -v0, v1, -v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_frexp_mant_f32_e64 v3, -v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v3, -v0, v3, s[4:5] +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0 -; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_div_scale_f32 v4, vcc, -v0, v1, -v0 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX7-NEXT: v_rcp_f32_e32 v2, v2 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX7-NEXT: v_frexp_mant_f32_e64 v0, -v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, -v0, v1, -v0 -; GFX8-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 3 -; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX8-NEXT: v_frexp_mant_f32_e64 v0, -v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, -v0 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, -v0, v1, -v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e64 v3, -v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, -v0 -; 
GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, -v0, v1, -v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: s_denorm_mode 15 +; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e64 v3, -v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, -v0 +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp: @@ -5808,101 +5446,75 @@ } define float @v_fdiv_negrhs_f32_ieee_25ulp(float %x, float %y) #1 { -; GFX6-FASTFMA-LABEL: v_fdiv_negrhs_f32_ieee_25ulp: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, -v1, v0 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_negrhs_f32_ieee_25ulp: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0 -; 
GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, -v1, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_negrhs_f32_ieee_25ulp: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6 +; GFX6-NEXT: v_cndmask_b32_e64 v2, -v1, v2, s[4:5] +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_negrhs_f32_ieee_25ulp: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, -v1, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 +; GFX7-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; GFX7-NEXT: v_rcp_f32_e32 v2, v2 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 
v3, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_negrhs_f32_ieee_25ulp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, -v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX8-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 +; GFX8-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_negrhs_f32_ieee_25ulp: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, -v1, -v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, -v1, v0 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 +; GFX10-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: 
v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_negrhs_f32_ieee_25ulp: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, -v1, -v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, -v1, v0 -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_fdiv_negrhs_f32_ieee_25ulp: @@ -6034,113 +5646,75 @@ } define float @v_fdiv_negrhs_f32_dynamic_25ulp(float %x, float %y) #2 { -; GFX6-FASTFMA-LABEL: v_fdiv_negrhs_f32_dynamic_25ulp: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, -v1, v0 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_negrhs_f32_dynamic_25ulp: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, -v1, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_negrhs_f32_dynamic_25ulp: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6 +; GFX6-NEXT: v_cndmask_b32_e64 v2, -v1, v2, s[4:5] +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_negrhs_f32_dynamic_25ulp: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; 
GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, -v1, v0 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 +; GFX7-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; GFX7-NEXT: v_rcp_f32_e32 v2, v2 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_negrhs_f32_dynamic_25ulp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, -v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX8-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 +; GFX8-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_negrhs_f32_dynamic_25ulp: ; GFX10: 
; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, -v1, -v1, v0 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, -v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 +; GFX10-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_negrhs_f32_dynamic_25ulp: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, -v1, -v1, v0 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, -v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: s_denorm_mode 15 +; GFX11-NEXT: v_frexp_mant_f32_e64 v2, -v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, -v1, v0 +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: 
v_ldexp_f32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_fdiv_negrhs_f32_dynamic_25ulp: @@ -6428,105 +6002,57 @@ } define float @v_fdiv_f32_constrhs0_ieee_25ulp(float %x) #1 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_constrhs0_ieee_25ulp: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, v3, v2, v2 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, v0, s6, v0 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v1, s6, v0 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_constrhs0_ieee_25ulp: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, v0, s6, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v4, v2, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v4, v2 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v4, v2 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v1, s6, v0 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_f32_constrhs0_ieee_25ulp: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v2, v0 +; GFX6-NEXT: 
v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, 0x4640e400 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x3fa9e0f0, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_f32_constrhs0_ieee_25ulp: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 -; GFX7-NEXT: v_rcp_f32_e32 v2, v1 -; GFX7-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX7-NEXT: v_fma_f32 v2, v3, v2, v2 -; GFX7-NEXT: v_div_scale_f32 v3, vcc, v0, s6, v0 -; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX7-NEXT: v_div_fixup_f32 v0, v1, s6, v0 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 0x3fa9e0f0, v0 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, -14, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f32_constrhs0_ieee_25ulp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, s6, v0 -; GFX8-NEXT: v_rcp_f32_e32 v3, v1 -; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3 -; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2 -; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2 -; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; GFX8-NEXT: v_div_fixup_f32 v0, v1, s6, v0 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fa9e0f0, v0 +; GFX8-NEXT: 
v_add_u32_e32 v1, vcc, -14, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f32_constrhs0_ieee_25ulp: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v1, s4, 0x4640e400, 0x4640e400, v0 -; GFX10-NEXT: v_rcp_f32_e32 v2, v1 -; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0 -; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX10-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0 +; GFX10-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fa9e0f0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, -14, v0 +; GFX10-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f32_constrhs0_ieee_25ulp: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0 -; GFX11-NEXT: v_rcp_f32_e32 v2, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0 -; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX11-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0 +; GFX11-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_dual_mul_f32 v1, 0x3fa9e0f0, v1 :: v_dual_add_nc_u32 v0, -14, v0 +; GFX11-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_fdiv_f32_constrhs0_ieee_25ulp: @@ -6660,117 +6186,57 @@ } define float 
@v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #2 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, v0, s6, v0 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v1, s6, v0 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, v0, s6, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v4, v2, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v4, v2 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v4, v2 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v1, s6, v0 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp: +; GFX6: ; %bb.0: +; GFX6-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v2, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, 0x4640e400 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, 0x3fa9e0f0, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 -; GFX7-NEXT: v_rcp_f32_e32 v2, v1 -; GFX7-NEXT: v_div_scale_f32 v3, vcc, v0, s6, v0 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX7-NEXT: v_div_fixup_f32 v0, v1, s6, v0 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 0x3fa9e0f0, v0 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, -14, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, s6, v0 -; GFX8-NEXT: v_rcp_f32_e32 v3, v1 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3 -; GFX8-NEXT: v_fma_f32 v5, -v1, v4, 
v2 -; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; GFX8-NEXT: v_div_fixup_f32 v0, v1, s6, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v1, s4, 0x4640e400, 0x4640e400, v0 -; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0 -; GFX10-NEXT: v_rcp_f32_e32 v2, v1 -; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX10-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x3fa9e0f0, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, -14, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fa9e0f0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, -14, v0 +; GFX10-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v1, null, 0x4640e400, 0x4640e400, v0 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v0, 0x4640e400, v0 -; GFX11-NEXT: v_rcp_f32_e32 v2, v1 -; GFX11-NEXT: s_denorm_mode 15 -; GFX11-NEXT: 
s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v2, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX11-NEXT: v_div_fixup_f32 v0, v1, 0x4640e400, v0 +; GFX11-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_dual_mul_f32 v1, 0x3fa9e0f0, v1 :: v_dual_add_nc_u32 v0, -14, v0 +; GFX11-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp: @@ -7029,105 +6495,64 @@ } define float @v_fdiv_f32_constlhs0_ieee_25ulp(float %x) #1 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_constlhs0_ieee_25ulp: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, v3, v2, v2 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, s6, v0, s6 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v1, v0, s6 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_constlhs0_ieee_25ulp: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; GFX6-SLOWFMA-NEXT: 
v_fma_f32 v3, v4, v3, v3 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v4, v2, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v4, v2 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v4, v2 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v1, v0, s6 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_f32_constlhs0_ieee_25ulp: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX6-NEXT: v_rcp_f32_e32 v1, v1 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v2, 0x4640e400 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x3f40e400, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_f32_constlhs0_ieee_25ulp: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 -; GFX7-NEXT: v_rcp_f32_e32 v2, v1 -; GFX7-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX7-NEXT: v_fma_f32 v2, v3, v2, v2 -; GFX7-NEXT: v_div_scale_f32 v3, vcc, s6, v0, s6 -; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, s6 +; GFX7-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX7-NEXT: v_rcp_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX7-NEXT: v_sub_i32_e32 v0, vcc, 14, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 0x3f40e400, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f32_constlhs0_ieee_25ulp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX8-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 -; GFX8-NEXT: v_rcp_f32_e32 v3, v1 -; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3 -; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2 -; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2 -; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, s6 +; GFX8-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 14, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x3f40e400, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f32_constlhs0_ieee_25ulp: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0x4640e400 -; GFX10-NEXT: v_rcp_f32_e32 v2, v1 -; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400 -; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400 +; GFX10-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 14, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x3f40e400, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f32_constlhs0_ieee_25ulp: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x4640e400 -; GFX11-NEXT: v_rcp_f32_e32 v2, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX11-NEXT: 
v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, 14, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400 -; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400 +; GFX11-NEXT: v_mul_f32_e32 v1, 0x3f40e400, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_fdiv_f32_constlhs0_ieee_25ulp: @@ -7261,117 +6686,64 @@ } define float @v_fdiv_f32_constlhs0_dynamic_25ulp(float %x) #2 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, s6, v0, s6 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v1, v0, s6 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v1, s[4:5], v0, 
v0, s6 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v3, v1 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v4, v2, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v1, v4, v2 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v1, -v1, v4, v2 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v1, v0, s6 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GFX6-NEXT: v_rcp_f32_e32 v1, v1 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v2, 0x4640e400 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x3f40e400, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v1, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 -; GFX7-NEXT: v_rcp_f32_e32 v2, v1 -; GFX7-NEXT: v_div_scale_f32 v3, vcc, s6, v0, s6 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 -; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX7-NEXT: 
v_div_fixup_f32 v0, v1, v0, s6 +; GFX7-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX7-NEXT: v_rcp_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX7-NEXT: v_sub_i32_e32 v0, vcc, 14, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 0x3f40e400, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v1, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, 0x4640e400 -; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 -; GFX8-NEXT: v_rcp_f32_e32 v3, v1 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v4, v2, v3 -; GFX8-NEXT: v_fma_f32 v5, -v1, v4, v2 -; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; GFX8-NEXT: v_div_fixup_f32 v0, v1, v0, s6 +; GFX8-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX8-NEXT: v_rcp_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 14, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x3f40e400, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0x4640e400 -; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400 -; GFX10-NEXT: v_rcp_f32_e32 v2, v1 -; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; 
GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400 +; GFX10-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 14, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x3f40e400, v1 +; GFX10-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x4640e400 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0x4640e400, v0, 0x4640e400 -; GFX11-NEXT: v_rcp_f32_e32 v2, v1 -; GFX11-NEXT: s_denorm_mode 15 +; GFX11-NEXT: v_frexp_mant_f32_e32 v1, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, 14, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v2, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4 -; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0x4640e400 +; GFX11-NEXT: v_mul_f32_e32 v1, 0x3f40e400, v1 +; GFX11-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_fdiv_f32_constlhs0_dynamic_25ulp: @@ -7656,101 +7028,75 @@ } define float @v_fdiv_f32_ieee_25ulp_nodenorm_x(float nofpclass(sub) %x, float %y) #1 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_x: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 -; 
GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_x: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_x: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_x: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; 
GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX7-NEXT: v_rcp_f32_e32 v2, v2 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_x: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_x: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; 
GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_x: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_x: @@ -7880,113 +7226,75 @@ } define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float %y) #2 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: 
v_frexp_mant_f32_e32 v3, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX7-NEXT: v_rcp_f32_e32 v2, v2 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v2, -v2, 
v5, v3 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: s_denorm_mode 15 +; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v0 +; 
GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x: @@ -8267,101 +7575,75 @@ } define float @v_fdiv_f32_ieee_25ulp_nodenorm_y(float %x, float nofpclass(sub) %y) #1 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_y: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_y: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 
-; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_y: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_y: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX7-NEXT: v_rcp_f32_e32 v2, v2 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
v_fdiv_f32_ieee_25ulp_nodenorm_y: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_y: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_y: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_fdiv_f32_ieee_25ulp_nodenorm_y: @@ -8491,113 +7773,75 @@ } define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub) %y) #2 { -; GFX6-FASTFMA-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y: -; GFX6-FASTFMA: ; %bb.0: -; GFX6-FASTFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-FASTFMA-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-SLOWFMA-LABEL: 
v_fdiv_f32_dynamic_25ulp_nodenorm_y: -; GFX6-SLOWFMA: ; %bb.0: -; GFX6-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX6-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX6-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX6-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX6-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX6-SLOWFMA-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX6-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX6-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc +; GFX6-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_ldexp_f32_e32 v0, v2, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX7-NEXT: 
v_mul_f32_e32 v5, v4, v3 -; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX7-NEXT: v_rcp_f32_e32 v2, v2 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 
-; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: s_denorm_mode 15 +; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v1 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v0 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX11-NEXT: s_denorm_mode 12 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_ldexp_f32 v0, v2, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y: @@ -8776,5 +8020,3 @@ attributes #2 = { "denormal-fp-math-f32"="dynamic,dynamic" } !0 = !{float 
2.500000e+00} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX6: {{.*}} Index: llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -337,45 +337,34 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_c_by_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 2.0 -; GCN-DENORM-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, -2.0 -; GCN-DENORM-NEXT: v_div_scale_f32 v2, vcc, 2.0, s4, 2.0 -; GCN-DENORM-NEXT: v_div_scale_f32 v3, s[0:1], -2.0, s7, -2.0 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v4, v0 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v1 -; GCN-DENORM-NEXT: v_fma_f32 v6, -v0, v4, 1.0 -; GCN-DENORM-NEXT: v_fma_f32 v4, v6, v4, v4 -; GCN-DENORM-NEXT: v_fma_f32 v7, -v1, v5, 1.0 -; GCN-DENORM-NEXT: v_fma_f32 v5, v7, v5, v5 -; GCN-DENORM-NEXT: v_mul_f32_e32 v6, v2, v4 -; GCN-DENORM-NEXT: v_mul_f32_e32 v7, v3, v5 -; GCN-DENORM-NEXT: v_fma_f32 v8, -v0, v6, v2 -; GCN-DENORM-NEXT: v_fma_f32 v9, -v1, v7, v3 -; GCN-DENORM-NEXT: v_fma_f32 v6, v8, v4, v6 -; GCN-DENORM-NEXT: v_fma_f32 v7, v9, v5, v7 -; GCN-DENORM-NEXT: v_fma_f32 v0, -v0, v6, v2 -; GCN-DENORM-NEXT: v_fma_f32 v1, -v1, v7, v3 -; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v4, v6 -; GCN-DENORM-NEXT: s_mov_b64 vcc, s[0:1] -; GCN-DENORM-NEXT: v_div_fmas_f32 v3, v1, v5, v7 -; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v1, s5 -; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v5, -s6 +; GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-DENORM-NEXT: 
s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v1, s0 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v2, s1 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v3, s1 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v5, -s2 +; GCN-DENORM-NEXT: v_sub_u32_e32 v3, 0, v3 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v5 -; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s5 -; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v6, s6 -; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2 -; GCN-DENORM-NEXT: v_sub_u32_e32 v6, 0, v6 -; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 -; GCN-DENORM-NEXT: v_ldexp_f32 v1, v1, v2 -; GCN-DENORM-NEXT: v_ldexp_f32 v2, v5, v6 -; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, 2.0 -; GCN-DENORM-NEXT: v_div_fixup_f32 v3, v3, s7, -2.0 -; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GCN-DENORM-NEXT: v_mul_f32_e32 v7, 0.5, v1 +; GCN-DENORM-NEXT: v_ldexp_f32 v1, v2, v3 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v2, s3 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v3, v2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v6, s2 +; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v6 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v0, s0 +; GCN-DENORM-NEXT: v_ldexp_f32 v2, v5, v2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v5, s3 +; GCN-DENORM-NEXT: v_sub_u32_e32 v0, 2, v0 +; GCN-DENORM-NEXT: v_mul_f32_e32 v3, -0.5, v3 +; GCN-DENORM-NEXT: v_sub_u32_e32 v5, 2, v5 +; GCN-DENORM-NEXT: v_ldexp_f32 v0, v7, v0 +; GCN-DENORM-NEXT: v_ldexp_f32 v3, v3, v5 +; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_v4_c_by_x_25ulp: @@ -412,45 +401,34 @@ define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(ptr addrspace(1) %arg) { ; GCN-DENORM-LABEL: div_v4_c_by_minus_x_25ulp: ; GCN-DENORM: ; %bb.0: -; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-DENORM-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; 
GCN-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, -2.0 -; GCN-DENORM-NEXT: v_div_scale_f32 v1, s[0:1], -s7, -s7, -2.0 -; GCN-DENORM-NEXT: v_div_scale_f32 v2, vcc, -2.0, s4, -2.0 -; GCN-DENORM-NEXT: v_div_scale_f32 v3, s[0:1], -2.0, -s7, -2.0 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v4, v0 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v1 -; GCN-DENORM-NEXT: v_fma_f32 v6, -v0, v4, 1.0 -; GCN-DENORM-NEXT: v_fma_f32 v4, v6, v4, v4 -; GCN-DENORM-NEXT: v_fma_f32 v7, -v1, v5, 1.0 -; GCN-DENORM-NEXT: v_fma_f32 v5, v7, v5, v5 -; GCN-DENORM-NEXT: v_mul_f32_e32 v6, v2, v4 -; GCN-DENORM-NEXT: v_mul_f32_e32 v7, v3, v5 -; GCN-DENORM-NEXT: v_fma_f32 v8, -v0, v6, v2 -; GCN-DENORM-NEXT: v_fma_f32 v9, -v1, v7, v3 -; GCN-DENORM-NEXT: v_fma_f32 v6, v8, v4, v6 -; GCN-DENORM-NEXT: v_fma_f32 v7, v9, v5, v7 -; GCN-DENORM-NEXT: v_fma_f32 v0, -v0, v6, v2 -; GCN-DENORM-NEXT: v_fma_f32 v1, -v1, v7, v3 -; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v4, v6 -; GCN-DENORM-NEXT: s_mov_b64 vcc, s[0:1] -; GCN-DENORM-NEXT: v_div_fmas_f32 v3, v1, v5, v7 -; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v1, -s5 -; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v5, s6 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v1, -s0 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v2, -s1 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v3, s1 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v5, s2 +; GCN-DENORM-NEXT: v_sub_u32_e32 v3, 0, v3 ; GCN-DENORM-NEXT: v_rcp_f32_e32 v5, v5 -; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s5 -; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v6, s6 -; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2 -; GCN-DENORM-NEXT: v_sub_u32_e32 v6, 0, v6 -; GCN-DENORM-NEXT: v_mov_b32_e32 v4, 0 -; GCN-DENORM-NEXT: v_ldexp_f32 v1, v1, v2 -; GCN-DENORM-NEXT: v_ldexp_f32 v2, v5, v6 -; 
GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, -2.0 -; GCN-DENORM-NEXT: v_div_fixup_f32 v3, v3, -s7, -2.0 -; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GCN-DENORM-NEXT: v_mul_f32_e32 v7, 0.5, v1 +; GCN-DENORM-NEXT: v_ldexp_f32 v1, v2, v3 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v2, -s3 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v3, v2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v6, s2 +; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v6 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v0, s0 +; GCN-DENORM-NEXT: v_ldexp_f32 v2, v5, v2 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v5, s3 +; GCN-DENORM-NEXT: v_sub_u32_e32 v0, 2, v0 +; GCN-DENORM-NEXT: v_mul_f32_e32 v3, -0.5, v3 +; GCN-DENORM-NEXT: v_sub_u32_e32 v5, 2, v5 +; GCN-DENORM-NEXT: v_ldexp_f32 v0, v7, v0 +; GCN-DENORM-NEXT: v_ldexp_f32 v3, v3, v5 +; GCN-DENORM-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_v4_c_by_minus_x_25ulp: @@ -492,24 +470,19 @@ ; GCN-DENORM: ; %bb.0: ; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-DENORM-NEXT: v_mov_b32_e32 v0, 0 +; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s4 +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v3, s4 ; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_mov_b32_e32 v0, s4 -; GCN-DENORM-NEXT: s_load_dword s5, s[2:3], 0x0 -; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GCN-DENORM-NEXT: v_div_scale_f32 v1, s[0:1], s5, s5, v0 -; GCN-DENORM-NEXT: v_mov_b32_e32 v2, s5 -; GCN-DENORM-NEXT: v_div_scale_f32 v2, vcc, s4, v2, s4 -; GCN-DENORM-NEXT: v_rcp_f32_e32 v3, v1 -; GCN-DENORM-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; GCN-DENORM-NEXT: v_fma_f32 v3, v4, v3, v3 -; GCN-DENORM-NEXT: v_mul_f32_e32 v4, v2, v3 -; GCN-DENORM-NEXT: v_fma_f32 v5, -v1, v4, v2 -; GCN-DENORM-NEXT: v_fma_f32 v4, v5, v3, v4 -; GCN-DENORM-NEXT: v_fma_f32 v1, -v1, v4, v2 -; GCN-DENORM-NEXT: v_div_fmas_f32 v1, 
v1, v3, v4 -; GCN-DENORM-NEXT: v_mov_b32_e32 v2, 0 -; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v1, s5, v0 -; GCN-DENORM-NEXT: global_store_dword v2, v0, s[2:3] +; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v1, s0 +; GCN-DENORM-NEXT: v_rcp_f32_e32 v1, v1 +; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v4, s0 +; GCN-DENORM-NEXT: v_sub_u32_e32 v2, v2, v4 +; GCN-DENORM-NEXT: v_mul_f32_e32 v1, v3, v1 +; GCN-DENORM-NEXT: v_ldexp_f32 v1, v1, v2 +; GCN-DENORM-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-DENORM-NEXT: s_endpgm ; ; GCN-FLUSH-LABEL: div_v_by_x_25ulp: Index: llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll +++ llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll @@ -593,33 +593,27 @@ ; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_f32_ulp25: ; CODEGEN-IEEE-SDAG: ; %bb.0: ; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CODEGEN-IEEE-SDAG-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; CODEGEN-IEEE-SDAG-NEXT: v_rcp_f32_e32 v3, v2 -; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v3, v4, v3, v3 -; CODEGEN-IEEE-SDAG-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v5, v4, v3 -; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v6, -v2, v5, v4 -; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v5, v6, v3, v5 -; CODEGEN-IEEE-SDAG-NEXT: v_fma_f32 v2, -v2, v5, v4 -; CODEGEN-IEEE-SDAG-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; CODEGEN-IEEE-SDAG-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_frexp_mant_f32_e32 v2, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_rcp_f32_e32 v2, v2 +; CODEGEN-IEEE-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v3, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_frexp_mant_f32_e32 v0, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 +; CODEGEN-IEEE-SDAG-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; CODEGEN-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; CODEGEN-IEEE-SDAG-NEXT: 
s_setpc_b64 s[30:31] ; ; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_f32_ulp25: ; CODEGEN-IEEE-GISEL: ; %bb.0: ; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v3, v2 -; CODEGEN-IEEE-GISEL-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v3, v5, v3, v3 -; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v5, v4, v3 -; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v6, -v2, v5, v4 -; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v5, v6, v3, v5 -; CODEGEN-IEEE-GISEL-NEXT: v_fma_f32 v2, -v2, v5, v4 -; CODEGEN-IEEE-GISEL-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; CODEGEN-IEEE-GISEL-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_frexp_mant_f32_e32 v2, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_frexp_mant_f32_e32 v3, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 +; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, v3, v2 +; CODEGEN-IEEE-GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CODEGEN-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v2, v0 ; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; IR-IEEE-SDAG-LABEL: v_fdiv_f32_ulp25: