Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -36,6 +36,12 @@
 
 using namespace llvm;
 
+// -amdgpu-fast-fdiv - Use the fast but less precise f32 fdiv lowering.
+static cl::opt<bool> EnableAMDGPUFastFDIV(
+  "amdgpu-fast-fdiv",
+  cl::desc("Use the fast, less precise f32 fdiv lowering"),
+  cl::init(false));
+
 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
@@ -2021,8 +2027,13 @@
 }
 
 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
-  if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
-    return FastLowered;
+  // Only try the fast lowering when the node carries fast-math flags that
+  // allow a reciprocal to be used.
+  const SDNodeFlags *Flags = Op->getFlags();
+  if (Flags && Flags->hasAllowReciprocal()) {
+    if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
+      return FastLowered;
+  }
 
   // This uses v_rcp_f32 which does not handle denormals. Let this hit a
   // selection error for now rather than do something incorrect.
@@ -2033,32 +2044,72 @@
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
 
-  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
-
-  const APFloat K0Val(BitsToFloat(0x6f800000));
-  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
-
-  const APFloat K1Val(BitsToFloat(0x2f800000));
-  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
-
-  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
-
-  EVT SetCCVT =
-      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
-
-  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
-
-  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
-
-  // TODO: Should this propagate fast-math-flags?
-
-  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
-
-  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
-
-  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
-
-  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+  // -amdgpu-fast-fdiv selects the cheaper but less precise expansion.
+  if (EnableAMDGPUFastFDIV) {
+    SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+
+    const APFloat K0Val(BitsToFloat(0x6f800000));
+    const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
+
+    const APFloat K1Val(BitsToFloat(0x2f800000));
+    const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
+
+    const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
+
+    EVT SetCCVT =
+        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+
+    SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+
+    SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+
+    // TODO: Should this propagate fast-math-flags?
+
+    r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
+
+    SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+
+    SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+
+    return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+  }
+
+  // Otherwise generate the more precise expansion: DIV_SCALE both operands,
+  // feed the RCP of the scaled denominator through a chain of FMAs, and
+  // finish with DIV_FMAS and DIV_FIXUP.
+  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
+
+  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
+
+  SDValue DenominatorScaled =
+      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS);
+  SDValue NumeratorScaled =
+      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS);
+
+  SDValue ApproxRcp =
+      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled);
+
+  SDValue NegDivScale0 =
+      DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
+
+  SDValue Fma0 =
+      DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One);
+  SDValue Fma1 =
+      DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp);
+
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);
+
+  SDValue Fma2 =
+      DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled);
+  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
+  SDValue Fma4 =
+      DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled);
+
+  // The second result of the numerator's DIV_SCALE is the flag DIV_FMAS needs.
+  SDValue Scale = NumeratorScaled.getValue(1);
+  SDValue Fmas =
+      DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale);
+
+  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
 }
 
 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {