Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -42,6 +42,7 @@ SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFMAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -292,6 +292,8 @@ setOperationAction(ISD::FCOS, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); + if (Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Custom); if (Subtarget->has16BitInsts()) { setOperationAction(ISD::Constant, MVT::i16, Legal); @@ -362,7 +364,6 @@ setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); setOperationAction(ISD::FMINNUM, MVT::f16, Legal); - setOperationAction(ISD::FDIV, MVT::f16, Custom); // F16 - VOP3 Actions. setOperationAction(ISD::FMA, MVT::f16, Legal); @@ -1956,6 +1957,7 @@ return LowerTrig(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::FDIV: return LowerFDIV(Op, DAG); + case ISD::FMAD: return LowerFMAD(Op, DAG); case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::GlobalAddress: { @@ -3277,6 +3279,14 @@ return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); } +SDValue SITargetLowering::LowerFMAD(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc SL(Op); + + return DAG.getNode(ISD::FMA, SL, VT, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); +} + + SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) return FastLowered; Index: test/CodeGen/AMDGPU/udiv.ll =================================================================== --- test/CodeGen/AMDGPU/udiv.ll +++ test/CodeGen/AMDGPU/udiv.ll @@ -1,6 +1,8 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=DENORM %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=NODENORM %s ; FUNC-LABEL: {{^}}udiv_i32: ; EG-NOT: SETGE_INT @@ -158,3 +160,35 @@ store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16 ret void } + +; GCN-LABEL: {{^}}fdiv_enable_denormals +; DENORM: v_fma_f32 +define amdgpu_kernel void @fdiv_enable_denormals(i8 addrspace(1)* nocapture readonly %arg) #0 { +bb: + %tmp = load i8, i8 addrspace(1)* null, align 1 + %tmp1 = sext i8 %tmp to i32 + %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef + %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1 + %tmp4 = sext i8 %tmp3 to i32 + %tmp5 = sdiv i32 %tmp1, %tmp4 + %tmp6 = trunc i32 %tmp5 to i8 + store i8 %tmp6, i8 addrspace(1)* null, align 1 + ret void +} + +; GCN-LABEL: {{^}}fdiv_disable_denormals +; NODENORM: v_mad_f32 +define amdgpu_kernel void @fdiv_disable_denormals(i8 addrspace(1)* nocapture readonly %arg) #0 { +bb: + %tmp = load i8, i8 addrspace(1)* null, align 1 + %tmp1 = sext i8 %tmp to i32 + %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef + %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1 + %tmp4 = sext i8 %tmp3 to i32 + %tmp5 = sdiv i32 %tmp1, %tmp4 + %tmp6 = trunc i32 %tmp5 to i8 + store i8 %tmp6, i8 addrspace(1)* null, align 1 + ret void +} + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }