Index: lib/Target/R600/SIISelLowering.h
===================================================================
--- lib/Target/R600/SIISelLowering.h
+++ lib/Target/R600/SIISelLowering.h
@@ -27,6 +27,9 @@
                               SelectionDAG &DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
Index: lib/Target/R600/SIISelLowering.cpp
===================================================================
--- lib/Target/R600/SIISelLowering.cpp
+++ lib/Target/R600/SIISelLowering.cpp
@@ -220,6 +220,8 @@
   setOperationAction(ISD::FNEG, MVT::f64, Expand);
   setOperationAction(ISD::FABS, MVT::f64, Expand);
 
+  setOperationAction(ISD::FDIV, MVT::f32, Custom);
+
   setTargetDAGCombine(ISD::SELECT_CC);
 
   setTargetDAGCombine(ISD::SETCC);
@@ -619,6 +621,7 @@
   }
 
   case ISD::SELECT: return LowerSELECT(Op, DAG);
+  case ISD::FDIV: return LowerFDIV(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: {
@@ -907,6 +910,96 @@
   return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
 }
 
+/// Lower an FDIV for unsafe FP math by rewriting it with RCP/RSQ nodes:
+///   1.0 / sqrt(x) -> rsq(x)
+///   1.0 / x       -> rcp(x)
+///   x / y         -> x * rcp(y)
+static SDValue performUnsafeFDIV(SDValue Op, SelectionDAG &DAG) {
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+
+  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
+    if (CLHS->isExactlyValue(1.0)) {
+
+      // 1.0 / sqrt(x) -> rsq(x)
+      if (RHS.getOpcode() == ISD::FSQRT)
+        return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
+
+      // 1.0 / x -> rcp(x)
+      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+    }
+  }
+
+  // Turn into multiply by the reciprocal
+  // x / y -> x * (1.0 / y)
+  SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+  return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip);
+}
+
+/// Custom lowering for f32 FDIV. With unsafe FP math enabled this defers to
+/// performUnsafeFDIV. Otherwise, when |RHS| is greater than K0 (2^96) the
+/// denominator is pre-scaled by K1 (2^-32) before taking the reciprocal, and
+/// the quotient is multiplied by the same scale factor afterwards; for any
+/// other denominator the scale factor is 1.0.
+SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
+  if (DAG.getTarget().Options.UnsafeFPMath)
+    return performUnsafeFDIV(Op, DAG);
+
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+
+  // K0 = 2^96 and K1 = 2^-32, given as IEEE-754 single bit patterns.
+  const APFloat K0Val(BitsToFloat(0x6f800000));
+  const SDValue K0 = DAG.getConstantFP(K0Val, MVT::f32);
+
+  const APFloat K1Val(BitsToFloat(0x2f800000));
+  const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32);
+
+  // This is an operand of a generic ISD::SELECT, so it has to be a regular
+  // constant like K0/K1 rather than a target constant.
+  const SDValue One = DAG.getConstantFP(1.0, MVT::f32);
+
+  EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
+
+  // r3 = |RHS| > K0 ? K1 : 1.0 (the scale factor).
+  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+
+  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+
+  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
+
+  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+
+  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+
+  // Result = r3 * (LHS * rcp(RHS * r3)).
+  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+}
+
+/// f64 FDIV has no custom lowering here; an empty SDValue is returned.
+SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
+  return SDValue();
+}
+
+/// Dispatch a custom-lowered FDIV to the f32 or f64 implementation based on
+/// the result type; any other type is treated as unreachable.
+SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  if (VT == MVT::f32)
+    return LowerFDIV32(Op, DAG);
+
+  if (VT == MVT::f64)
+    return LowerFDIV64(Op, DAG);
+
+  llvm_unreachable("Unexpected type for fdiv");
+}
+
 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   StoreSDNode *Store = cast<StoreSDNode>(Op);
Index: lib/Target/R600/SIInstructions.td
===================================================================
--- lib/Target/R600/SIInstructions.td
+++ lib/Target/R600/SIInstructions.td
@@ -1788,11 +1788,13 @@
 // VOP1 Patterns
 //===----------------------------------------------------------------------===//
 
-def : RcpPat<V_RCP_F32_e32, f32>;
 def : RcpPat<V_RCP_F64_e32, f64>;
-defm : RsqPat<V_RSQ_F32_e32, f32>;
 defm : RsqPat<V_RSQ_F64_e32, f64>;
 
+let Predicates = [UnsafeFPMath] in {
+defm : RsqPat<V_RSQ_F32_e32, f32>;
+}
+
 //===----------------------------------------------------------------------===//
 // VOP2 Patterns
 //===----------------------------------------------------------------------===//
@@ -2248,11 +2250,6 @@
 >;
 
 def : Pat<
-  (fdiv f32:$src0, f32:$src1),
-  (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1))
->;
-
-def : Pat<
   (fdiv f64:$src0, f64:$src1),
   (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0))
 >;
Index: test/CodeGen/R600/fdiv.ll
===================================================================
--- test/CodeGen/R600/fdiv.ll
+++ test/CodeGen/R600/fdiv.ll
@@ -1,20 +1,37 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; These tests check that fdiv is expanded correctly and also test that the
 ; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
 ; instruction groups.
 
-; R600-CHECK: @fdiv_v2f32
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
-; SI-CHECK: @fdiv_v2f32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
+; FUNC-LABEL: @fdiv_f32
+; A scalar fdiv needs only a single RECIP_IEEE/MUL_IEEE pair.
+; NOTE(review): assumes %a is in KC0[2].Z and %b is in KC0[2].W -- confirm.
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS
+
+; SI-DAG: V_RCP_F32
+; SI-DAG: V_MUL_F32
+define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) {
+entry:
+  %0 = fdiv float %a, %b
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+
+
+; FUNC-LABEL: @fdiv_v2f32
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+
+; SI-DAG: V_RCP_F32
+; SI-DAG: V_MUL_F32
+; SI-DAG: V_RCP_F32
+; SI-DAG: V_MUL_F32
 define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
 entry:
   %0 = fdiv <2 x float> %a, %b
@@ -22,24 +39,24 @@
   ret void
 }
 
-; R600-CHECK: @fdiv_v4f32
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; SI-CHECK: @fdiv_v4f32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
-; SI-CHECK-DAG: V_RCP_F32
-; SI-CHECK-DAG: V_MUL_F32
+; FUNC-LABEL: @fdiv_v4f32
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+
+; SI-DAG: V_RCP_F32
+; SI-DAG: V_MUL_F32
+; SI-DAG: V_RCP_F32
+; SI-DAG: V_MUL_F32
+; SI-DAG: V_RCP_F32
+; SI-DAG: V_MUL_F32
+; SI-DAG: V_RCP_F32
+; SI-DAG: V_MUL_F32
 define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float> addrspace(1) * %in
Index: test/CodeGen/R600/llvm.AMDGPU.rcp.ll
===================================================================
--- test/CodeGen/R600/llvm.AMDGPU.rcp.ll
+++ test/CodeGen/R600/llvm.AMDGPU.rcp.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
 
 declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone
 declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
@@ -24,7 +25,15 @@
 }
 
 ; FUNC-LABEL: @rcp_pat_f32
-; SI: V_RCP_F32_e32
+; SI-UNSAFE-NOT: V_MUL_F32
+; SI-UNSAFE: V_RCP_F32_e32
+; SI-UNSAFE-NOT: V_MUL_F32
+
+; Without unsafe math, check that the divide expansion surrounds V_RCP_F32 with multiplies.
+; SI-SAFE: V_MUL_F32
+; SI-SAFE: V_RCP_F32_e32
+; SI-SAFE: V_MUL_F32
+
 define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
   %rcp = fdiv float 1.0, %src
   store float %rcp, float addrspace(1)* %out, align 4
@@ -40,7 +49,9 @@
 }
 
 ; FUNC-LABEL: @rsq_rcp_pat_f32
-; SI: V_RSQ_F32_e32
+; SI-UNSAFE: V_RSQ_F32_e32
+; SI-SAFE: V_SQRT_F32_e32
+; SI-SAFE: V_RCP_F32_e32
 define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
   %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone
   %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone
Index: test/CodeGen/R600/rsq.ll
===================================================================
--- test/CodeGen/R600/rsq.ll
+++ test/CodeGen/R600/rsq.ll
@@ -1,10 +1,12 @@
-; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
 
 declare float @llvm.sqrt.f32(float) nounwind readnone
 declare double @llvm.sqrt.f64(double) nounwind readnone
 
 ; SI-LABEL: @rsq_f32
-; SI: V_RSQ_F32_e32
+; SI-UNSAFE: V_RSQ_F32_e32
+; SI-SAFE: V_SQRT_F32
 ; SI: S_ENDPGM
 define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %val = load float addrspace(1)* %in, align 4