Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7562,6 +7562,14 @@ return DAG.getNode(ISD::BITCAST, DL, VT, Res); } + +static bool correctlyRoundedDivSqrt(SelectionDAG &DAG) { + MachineFunction &MF = DAG.getMachineFunction(); + const Function &F = MF.getFunction(); + auto Attr = F.getFnAttribute("correctly-rounded-divide-sqrt-fp-math"); + return (Attr.getValueAsString() == "true"); +} + // Catch division cases where we can use shortcuts with rcp and rsq // instructions. SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, @@ -7573,7 +7581,8 @@ const SDNodeFlags Flags = Op->getFlags(); bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal(); - if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) + if (!Unsafe && VT == MVT::f32 && (correctlyRoundedDivSqrt(DAG) || + hasFP32Denormals(DAG.getMachineFunction()))) return SDValue(); if (const ConstantFPSDNode *CLHS = dyn_cast(LHS)) { Index: llvm/test/CodeGen/AMDGPU/fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv.ll +++ llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -73,6 +73,68 @@ ret void } +; FUNC-LABEL: {{^}}fdiv_f32_correctly_rounded_divide_sqrt: + +; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]] +; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] +; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] + +; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX10: s_denorm_mode 15 +; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 +; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] +; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] +; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] +; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] +; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] +; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX10: s_denorm_mode 12 +; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] +; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], + +define amdgpu_kernel void @fdiv_f32_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #3 { +entry: + %fdiv = fdiv float 1.000000e+00, %a + store float %fdiv, float addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}fdiv_f32_denorms_correctly_rounded_divide_sqrt: + +; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]] +; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] + +; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] +; PREGFX10-NOT: s_setreg +; PREGFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 +; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] +; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] +; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] +; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] +; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] +; PREGFX10-NOT: s_setreg + +; GFX10-NOT: s_denorm_mode +; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 +; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]] +; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] +; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] +; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]] +; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]] +; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]] +; GFX10-NOT: s_denorm_mode + +; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] +; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], +define amdgpu_kernel void @fdiv_f32_denorms_correctly_rounded_divide_sqrt(float addrspace(1)* %out, float %a) #4 { +entry: + %fdiv = fdiv float 1.000000e+00, %a + store float %fdiv, float addrspace(1)* %out + ret void +} + + ; FUNC-LABEL: {{^}}fdiv_25ulp_f32: ; GCN: v_cndmask_b32 ; GCN: v_mul_f32 @@ -287,5 +349,7 @@ attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals,+fp64-fp16-denormals,-flat-for-global" } attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" } attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals,-flat-for-global" } +attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" } +attributes #4 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="true" "target-features"="+fp32-denormals,-flat-for-global" } !0 = !{float 2.500000e+00}