Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -38,6 +38,7 @@
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFDIV16(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -304,7 +304,7 @@
     setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
-    setOperationAction(ISD::FDIV, MVT::f16, Promote);
+    setOperationAction(ISD::FDIV, MVT::f16, Custom);
 
     // F16 - VOP3 Actions.
     setOperationAction(ISD::FMA, MVT::f16, Legal);
@@ -3070,6 +3070,32 @@
                      GlueChain.getValue(2));
 }
 
+// Custom f16 fdiv lowering: evaluate the quotient in f32, round it to f16, and
+// finish with an f16 DIV_FIXUP.
+//   1. FP_EXTEND both f16 operands to f32.
+//   2. Multiply the extended numerator by RCP of the extended denominator.
+//   3. FP_ROUND the f32 product back to f16.
+//   4. Feed the rounded quotient, the original denominator and the original
+//      numerator (in that order) to DIV_FIXUP.
+SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+  SDValue Src0 = Op.getOperand(0);
+  SDValue Src1 = Op.getOperand(1);
+
+  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
+  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
+
+  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
+  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
+
+  // Trunc flag 0: the f32 -> f16 rounding is allowed to change the value.
+  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
+  SDValue BestQuot =
+      DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
+
+  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
+}
+
 // Faster 2.5 ULP division that does not support denormals.
 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
@@ -3263,6 +3289,9 @@
   if (VT == MVT::f64)
     return LowerFDIV64(Op, DAG);
 
+  if (VT == MVT::f16)
+    return LowerFDIV16(Op, DAG);
+
   llvm_unreachable("Unexpected type for fdiv");
 }
 
Index: test/CodeGen/AMDGPU/fdiv.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fdiv.f16.ll
+++ test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -4,27 +4,39 @@
-; Make sure fdiv is promoted to f32.
+; Make sure fdiv is promoted to f32 on SI and custom-lowered on VI.
 
 ; GCN-LABEL: {{^}}fdiv_f16
-; GCN: v_cvt_f32_f16
-; GCN: v_cvt_f32_f16
-; GCN: v_div_scale_f32
-; GCN-DAG: v_div_scale_f32
-; GCN-DAG: v_rcp_f32
-; GCN: v_fma_f32
-; GCN: v_fma_f32
-; GCN: v_mul_f32
-; GCN: v_fma_f32
-; GCN: v_fma_f32
-; GCN: v_fma_f32
-; GCN: v_div_fmas_f32
-; GCN: v_div_fixup_f32
-; GCN: v_cvt_f16_f32
+; SI: v_cvt_f32_f16
+; SI: v_cvt_f32_f16
+; SI: v_div_scale_f32
+; SI-DAG: v_div_scale_f32
+; SI-DAG: v_rcp_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_mul_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_fma_f32
+; SI: v_div_fmas_f32
+; SI: v_div_fixup_f32
+; SI: v_cvt_f16_f32
+
+; VI: buffer_load_ushort [[LHS:v[0-9]+]]
+; VI: buffer_load_ushort [[RHS:v[0-9]+]]
+
+; VI-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]]
+; VI-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]
+
+; VI-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
+; VI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[RCP_RHS]], [[CVT_LHS]]
+; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
+; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
+; VI: buffer_store_short [[RESULT]]
 define void @fdiv_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
 entry:
-  %a.val = load half, half addrspace(1)* %a
-  %b.val = load half, half addrspace(1)* %b
+  %a.val = load volatile half, half addrspace(1)* %a
+  %b.val = load volatile half, half addrspace(1)* %b
   %r.val = fdiv half %a.val, %b.val
   store half %r.val, half addrspace(1)* %r
   ret void