Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -2010,32 +2010,62 @@ SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + // Generate the efficient but not precise fpdiv32 if fast_math is ON + const SDNodeFlags *Flags = Op.getNode()->getFlags(); + if (Flags->hasUnsafeAlgebra()) { + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); - const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); - const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + + // TODO: Should this propagate fast-math-flags? + + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); + } + + // Generates more precise fpdiv32. 
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); - SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + // Scale the denominator (RHS, RHS, LHS) and numerator (LHS, RHS, LHS) + // separately; using the same operands for both would make the two + // DIV_SCALE nodes CSE into one and break the refinement below. + SDValue d_scaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS); + SDValue n_scaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS); - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + SDValue approx_rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, d_scaled); - // TODO: Should this propagate fast-math-flags? + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, d_scaled); + + SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, approx_rcp, One); + SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, approx_rcp, approx_rcp); - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, n_scaled, Fma1); - SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, n_scaled); + SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul); + SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, n_scaled); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + SDValue Scale = n_scaled.getValue(1); + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale); - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); } SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { Index: test/CodeGen/AMDGPU/fdiv.ll =================================================================== --- test/CodeGen/AMDGPU/fdiv.ll +++ test/CodeGen/AMDGPU/fdiv.ll @@ -6,7 +6,7 @@ ; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate ; instruction groups. 
-; FUNC-LABEL: {{^}}fdiv_f32: +; FUNC-LABEL: {{^}}fdiv_f32_fast_math: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS @@ -14,16 +14,15 @@ ; SI-DAG: v_rcp_f32 ; SI-DAG: v_mul_f32 -define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) { +define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) { entry: - %0 = fdiv float %a, %b + %0 = fdiv fast float %a, %b store float %0, float addrspace(1)* %out ret void } - -; FUNC-LABEL: {{^}}fdiv_v2f32: +; FUNC-LABEL: {{^}}fdiv_v2f32_fast_math: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS @@ -33,14 +32,14 @@ ; SI-DAG: v_mul_f32 ; SI-DAG: v_rcp_f32 ; SI-DAG: v_mul_f32 -define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { entry: - %0 = fdiv <2 x float> %a, %b + %0 = fdiv fast <2 x float> %a, %b store <2 x float> %0, <2 x float> addrspace(1)* %out ret void } -; FUNC-LABEL: {{^}}fdiv_v4f32: +; FUNC-LABEL: {{^}}fdiv_v4f32_fast_math: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} @@ -58,6 +57,116 @@ ; SI-DAG: v_mul_f32 ; SI-DAG: v_rcp_f32 ; SI-DAG: v_mul_f32 +define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 + %a = load <4 x float>, <4 x float> addrspace(1) * %in + %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr + %result = fdiv fast <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: 
{{^}}fdiv_f32: + +; SI-DAG: v_div_scale_f32 +; SI-DAG: v_div_scale_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_div_fmas_f32 +; SI-DAG: v_div_fixup_f32 +define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) { +entry: + %0 = fdiv float %a, %b + store float %0, float addrspace(1)* %out + ret void +} + + + +; FUNC-LABEL: {{^}}fdiv_v2f32: + +; SI-DAG: v_div_scale_f32 +; SI-DAG: v_div_scale_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_div_fmas_f32 +; SI-DAG: v_div_fixup_f32 +; SI-DAG: v_div_scale_f32 +; SI-DAG: v_div_scale_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_div_fmas_f32 +; SI-DAG: v_div_fixup_f32 +define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +entry: + %0 = fdiv <2 x float> %a, %b + store <2 x float> %0, <2 x float> addrspace(1)* %out + ret void +} + + +; FUNC-LABEL: {{^}}fdiv_v4f32: + +; SI-DAG: v_div_scale_f32 +; SI-DAG: v_div_scale_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_div_fmas_f32 +; SI-DAG: v_div_fixup_f32 +; SI-DAG: v_div_scale_f32 +; SI-DAG: v_div_scale_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_div_fmas_f32 +; SI-DAG: v_div_fixup_f32 +; SI-DAG: v_div_scale_f32 +; SI-DAG: v_div_scale_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_div_fmas_f32 +; SI-DAG: v_div_fixup_f32 +; SI-DAG: 
v_div_scale_f32 +; SI-DAG: v_div_scale_f32 +; SI-DAG: v_rcp_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_mul_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_fma_f32 +; SI-DAG: v_div_fmas_f32 +; SI-DAG: v_div_fixup_f32 define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in Index: test/CodeGen/AMDGPU/frem.ll =================================================================== --- test/CodeGen/AMDGPU/frem.ll +++ test/CodeGen/AMDGPU/frem.ll @@ -5,11 +5,12 @@ ; FUNC-LABEL: {{^}}frem_f32: ; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}} ; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16 -; GCN-DAG: v_cmp -; GCN-DAG: v_mul_f32 +; GCN: v_div_scale_f32 ; GCN: v_rcp_f32_e32 +; GCN: v_fma_f32 ; GCN: v_mul_f32_e32 -; GCN: v_mul_f32_e32 +; GCN: v_div_fmas_f32 +; GCN: v_div_fixup_f32 ; GCN: v_trunc_f32_e32 ; GCN: v_mad_f32 ; GCN: s_endpgm