Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -914,8 +914,7 @@ DivFMF |= SqrtFMF; Builder.setFastMathFlags(DivFMF); - // FIXME: Can't interpret approxFunc as ignore denormal here - if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || + if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath || canIgnoreDenormalInput(Den, CtxI)) { Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den); // -1.0 / sqrt(x) -> fneg(rsq(x)) @@ -1079,6 +1078,21 @@ const FastMathFlags DivFMF = FPOp->getFastMathFlags(); const float ReqdAccuracy = FPOp->getFPAccuracy(); + FastMathFlags SqrtFMF; + + Value *Num = FDiv.getOperand(0); + Value *Den = FDiv.getOperand(1); + + Value *RsqOp = nullptr; + auto *DenII = dyn_cast(Den); + if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt && + DenII->hasOneUse()) { + const auto *SqrtOp = cast(DenII); + SqrtFMF = SqrtOp->getFastMathFlags(); + if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF)) + RsqOp = SqrtOp->getOperand(0); + } + // Inaccurate rcp is allowed with unsafe-fp-math or afn. // // Defer to codegen to handle this. @@ -1089,28 +1103,13 @@ // don't need any pre-consideration here when we have better information. A // more conservative interpretation could use handling here. const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc(); - if (AllowInaccurateRcp) + if (!RsqOp && AllowInaccurateRcp) return false; // Defer the correct implementations to codegen. if (ReqdAccuracy < 1.0f) return false; - FastMathFlags SqrtFMF; - - Value *Num = FDiv.getOperand(0); - Value *Den = FDiv.getOperand(1); - - Value *RsqOp = nullptr; - auto *DenII = dyn_cast(Den); - if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt && - DenII->hasOneUse()) { - const auto *SqrtOp = cast(DenII); - SqrtFMF = SqrtOp->getFastMathFlags(); - if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF)) - RsqOp = SqrtOp->getOperand(0); - } - IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); Builder.setFastMathFlags(DivFMF); Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -1542,14 +1542,12 @@ ; IEEE-GOODFREXP-NEXT: [[SQRT_X_AFN_NO_MD:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]]) ; IEEE-GOODFREXP-NEXT: [[AFN_NO_MD:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_X_AFN_NO_MD]] ; IEEE-GOODFREXP-NEXT: store volatile float [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[SQRT_X_AFN_25ULP:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; IEEE-GOODFREXP-NEXT: [[AFN_25ULP:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_X_AFN_25ULP]], !fpmath !0 +; IEEE-GOODFREXP-NEXT: [[AFN_25ULP:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[X]]) ; IEEE-GOODFREXP-NEXT: store volatile float [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[SQRT_X_FAST_NO_MD:%.*]] = call fast float @llvm.sqrt.f32(float [[X]]) ; IEEE-GOODFREXP-NEXT: [[FAST_NO_MD:%.*]] = fdiv fast float 1.000000e+00, [[SQRT_X_FAST_NO_MD]] ; IEEE-GOODFREXP-NEXT: store volatile float [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-GOODFREXP-NEXT: [[SQRT_X_FAST_25ULP:%.*]] = call fast float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; IEEE-GOODFREXP-NEXT: [[FAST_25ULP:%.*]] = fdiv fast float 1.000000e+00, [[SQRT_X_FAST_25ULP]], !fpmath !0 +; IEEE-GOODFREXP-NEXT: [[FAST_25ULP:%.*]] = call fast float @llvm.amdgcn.rsq.f32(float [[X]]) ; IEEE-GOODFREXP-NEXT: store volatile float [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000 ; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = select contract i1 [[TMP16]], float 0x4170000000000000, float 1.000000e+00 @@ -1620,14 +1618,12 @@ ; IEEE-BADFREXP-NEXT: [[SQRT_X_AFN_NO_MD:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]]) ; IEEE-BADFREXP-NEXT: [[AFN_NO_MD:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_X_AFN_NO_MD]] ; IEEE-BADFREXP-NEXT: store volatile float [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[SQRT_X_AFN_25ULP:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; IEEE-BADFREXP-NEXT: [[AFN_25ULP:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_X_AFN_25ULP]], !fpmath !0 +; IEEE-BADFREXP-NEXT: [[AFN_25ULP:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[X]]) ; IEEE-BADFREXP-NEXT: store volatile float [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[SQRT_X_FAST_NO_MD:%.*]] = call fast float @llvm.sqrt.f32(float [[X]]) ; IEEE-BADFREXP-NEXT: [[FAST_NO_MD:%.*]] = fdiv fast float 1.000000e+00, [[SQRT_X_FAST_NO_MD]] ; IEEE-BADFREXP-NEXT: store volatile float [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; IEEE-BADFREXP-NEXT: [[SQRT_X_FAST_25ULP:%.*]] = call fast float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; IEEE-BADFREXP-NEXT: [[FAST_25ULP:%.*]] = fdiv fast float 1.000000e+00, [[SQRT_X_FAST_25ULP]], !fpmath !0 +; IEEE-BADFREXP-NEXT: [[FAST_25ULP:%.*]] = call fast float @llvm.amdgcn.rsq.f32(float [[X]]) ; IEEE-BADFREXP-NEXT: store volatile float [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000 ; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = select contract i1 [[TMP16]], float 0x4170000000000000, float 1.000000e+00 @@ -1683,14 +1679,12 @@ ; DAZ-NEXT: [[SQRT_X_AFN_NO_MD:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]]) ; DAZ-NEXT: [[AFN_NO_MD:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_X_AFN_NO_MD]] ; DAZ-NEXT: store volatile float [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_AFN_25ULP:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; DAZ-NEXT: [[AFN_25ULP:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_X_AFN_25ULP]], !fpmath !0 +; DAZ-NEXT: [[AFN_25ULP:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[SQRT_X_FAST_NO_MD:%.*]] = call fast float @llvm.sqrt.f32(float [[X]]) ; DAZ-NEXT: [[FAST_NO_MD:%.*]] = fdiv fast float 1.000000e+00, [[SQRT_X_FAST_NO_MD]] ; DAZ-NEXT: store volatile float [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4 -; DAZ-NEXT: [[SQRT_X_FAST_25ULP:%.*]] = call fast float @llvm.sqrt.f32(float [[X]]), !fpmath !0 -; DAZ-NEXT: [[FAST_25ULP:%.*]] = fdiv fast float 1.000000e+00, [[SQRT_X_FAST_25ULP]], !fpmath !0 +; DAZ-NEXT: [[FAST_25ULP:%.*]] = call fast float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 4 ; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]]) ; DAZ-NEXT: store volatile float [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4 Index: llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll +++ llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll @@ -1035,8 +1035,7 @@ ; CODEGEN-IEEE-SDAG-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: ; CODEGEN-IEEE-SDAG: ; %bb.0: ; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CODEGEN-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 -; CODEGEN-IEEE-SDAG-NEXT: v_rcp_f32_e32 v0, v0 +; CODEGEN-IEEE-SDAG-NEXT: v_rsq_f32_e32 v0, v0 ; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; CODEGEN-IEEE-GISEL-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: @@ -1058,18 +1057,11 @@ ; IR-IEEE-GISEL-NEXT: v_rsq_f32_e32 v0, v0 ; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; CODEGEN-DAZ-SDAG-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: -; CODEGEN-DAZ-SDAG: ; %bb.0: -; CODEGEN-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CODEGEN-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0 -; CODEGEN-DAZ-SDAG-NEXT: v_rcp_f32_e32 v0, v0 -; CODEGEN-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; CODEGEN-DAZ-GISEL-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: -; CODEGEN-DAZ-GISEL: ; %bb.0: -; CODEGEN-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CODEGEN-DAZ-GISEL-NEXT: v_rsq_f32_e32 v0, v0 -; CODEGEN-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31] +; CODEGEN-DAZ-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: +; CODEGEN-DAZ: ; %bb.0: +; CODEGEN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CODEGEN-DAZ-NEXT: v_rsq_f32_e32 v0, v0 +; CODEGEN-DAZ-NEXT: s_setpc_b64 s[30:31] ; ; IR-DAZ-SDAG-LABEL: v_recip_sqrt_f32_afn_ulp25_contract: ; IR-DAZ-SDAG: ; %bb.0: Index: llvm/test/CodeGen/AMDGPU/rsq.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/rsq.f32.ll +++ llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -406,7 +406,7 @@ ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm ; @@ -537,7 +537,7 @@ ; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1 ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 -; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-DAZ-UNSAFE-NEXT: s_endpgm ; @@ -658,7 +658,7 @@ ; GCN-DAZ-UNSAFE: ; %bb.0: ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 -; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32: @@ -711,8 +711,8 @@ ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1 -; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32: @@ -917,7 +917,7 @@ ; GCN-DAZ-UNSAFE: ; %bb.0: ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 -; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32: @@ -969,8 +969,8 @@ ; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0 ; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1 -; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; ; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32: