# Patch summary (leading text; patch tools skip everything before the first "Index:" header).
#
# What the patch does, per file:
#  * AMDGPUISelLowering.h   - adds the target node FMAD_FTZ to the node enum, after DIV_FIXUP.
#  * AMDGPUISelLowering.cpp - in the "float fr = mad(fqneg, fb, fa)" step, emits
#                             AMDGPUISD::FMAD_FTZ when Subtarget->hasFP32Denormals() is true
#                             and ISD::FMAD otherwise; registers NODE_NAME_CASE(FMAD_FTZ).
#  * AMDGPUInstrInfo.td     - defines the SDNode AMDGPUfmad_ftz with profile SDTFPTernaryOp.
#  * SIISelLowering.cpp     - removes setOperationAction(ISD::FDIV, MVT::f16, Custom).
#  * SIInstructions.td      - adds multiclass FMACPat, which selects AMDGPUfmad_ftz to
#                             (inst $src0, $src1, $src2), plus one defm instantiation.
#  * udiv.ll                - adds two fiji RUN lines (+fp32-denormals / -fp32-denormals) and two
#                             kernels that sdiv sign-extended i8 values; DENORM expects
#                             v_fma_f32, NODENORM expects v_mad_f32.
#
# NOTE(review): as shown here, every hunk body is collapsed onto a single line; the original line
#   breaks, blank context lines and indentation must be restored before this can be applied.
# NOTE(review): "multiclass FMACPat {" uses vt and inst without declaring them, and
#   "defm : FMACPat;" / "defm : FMADPat ;" carry no arguments. The "<...>" template lists look
#   like they were stripped in extraction - recover them from the original patch, do not guess.
# NOTE(review): the f16 FDIV Custom removal is not obviously related to FMAD_FTZ - confirm it is
#   intentional and not a rebase artifact.
# NOTE(review): the new "GCN-LABEL" checks have no trailing ':' (compare "FUNC-LABEL:
#   {{^}}udiv_i32:"), and the kernels are named fdiv_* although they exercise sdiv.
# NOTE(review): DENORM expects v_fma_f32 although the denormal path emits FMAD_FTZ - confirm the
#   expected instruction against the FMACPat instantiation.
#
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -266,6 +266,7 @@ DIV_SCALE, DIV_FMAS, DIV_FIXUP, + FMAD_FTZ, TRIG_PREOP, // 1 ULP max error for f64 // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1293,7 +1293,9 @@ SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa); + SDValue fr = Subtarget->hasFP32Denormals() ? + DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, FltVT, fqneg, fb, fa) : + DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa); // int iq = (int)fq; SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); @@ -3395,6 +3397,7 @@ NODE_NAME_CASE(DIV_SCALE) NODE_NAME_CASE(DIV_FMAS) NODE_NAME_CASE(DIV_FIXUP) + NODE_NAME_CASE(FMAD_FTZ) NODE_NAME_CASE(TRIG_PREOP) NODE_NAME_CASE(RCP) NODE_NAME_CASE(RSQ) Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -194,6 +194,9 @@ // Denominator, src2 = Numerator). 
def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; +def AMDGPUfmad_ftz : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>; + + // Look Up 2.0 / pi src0 with segment select src1[4:0] def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -362,7 +362,6 @@ setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); setOperationAction(ISD::FMINNUM, MVT::f16, Legal); - setOperationAction(ISD::FDIV, MVT::f16, Custom); // F16 - VOP3 Actions. setOperationAction(ISD::FMA, MVT::f16, Legal); Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -497,6 +497,15 @@ >; } +multiclass FMACPat { + def : Pat < + (vt (AMDGPUfmad_ftz vt:$src0, vt:$src1, vt:$src2)), + (inst $src0, $src1, $src2) + >; +} + +defm : FMACPat; + defm : FMADPat ; defm : FMADPat ; Index: test/CodeGen/AMDGPU/udiv.ll =================================================================== --- test/CodeGen/AMDGPU/udiv.ll +++ test/CodeGen/AMDGPU/udiv.ll @@ -1,6 +1,8 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=DENORM %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=NODENORM %s ; FUNC-LABEL: {{^}}udiv_i32: ; EG-NOT: SETGE_INT 
@@ -158,3 +160,35 @@ store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16 ret void } + +; GCN-LABEL: {{^}}fdiv_enable_denormals +; DENORM: v_fma_f32 +define amdgpu_kernel void @fdiv_enable_denormals(i8 addrspace(1)* nocapture readonly %arg) #0 { +bb: + %tmp = load i8, i8 addrspace(1)* null, align 1 + %tmp1 = sext i8 %tmp to i32 + %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef + %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1 + %tmp4 = sext i8 %tmp3 to i32 + %tmp5 = sdiv i32 %tmp1, %tmp4 + %tmp6 = trunc i32 %tmp5 to i8 + store i8 %tmp6, i8 addrspace(1)* null, align 1 + ret void +} + +; GCN-LABEL: {{^}}fdiv_disable_denormals +; NODENORM: v_mad_f32 +define amdgpu_kernel void @fdiv_disable_denormals(i8 addrspace(1)* nocapture readonly %arg) #0 { +bb: + %tmp = load i8, i8 addrspace(1)* null, align 1 + %tmp1 = sext i8 %tmp to i32 + %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef + %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1 + %tmp4 = sext i8 %tmp3 to i32 + %tmp5 = sdiv i32 %tmp1, %tmp4 + %tmp6 = trunc i32 %tmp5 to i8 + store i8 %tmp6, i8 addrspace(1)* null, align 1 + ret void +} + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }