Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -359,6 +359,10 @@ return FP64FP16Denormals; } + bool supportsMinMaxDenormModes() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + bool hasFPExceptions() const { return FPExceptions; } Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -4624,8 +4624,8 @@ return DAG.isKnownNeverNaN(Op); } -static bool isCanonicalized(SDValue Op, const SISubtarget *ST, - unsigned MaxDepth=5) { +static bool isCanonicalized(SelectionDAG &DAG, SDValue Op, + const SISubtarget *ST, unsigned MaxDepth=5) { // If source is a result of another standard FP operation it is already in // canonical form. @@ -4663,7 +4663,7 @@ case ISD::FNEG: case ISD::FABS: return (MaxDepth > 0) && - isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1); + isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1); case ISD::FSIN: case ISD::FCOS: @@ -4672,16 +4672,19 @@ // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. // For such targets need to check their input recursively. - // TODO: on GFX9+ we could return true without checking provided no-nan - // mode, since canonicalization is also used to quiet sNaNs. case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FMINNAN: case ISD::FMAXNAN: + if (ST->supportsMinMaxDenormModes() && + DAG.isKnownNeverNaN(Op.getOperand(0)) && + DAG.isKnownNeverNaN(Op.getOperand(1))) + return true; + return (MaxDepth > 0) && - isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) && - isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1); + isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) && + isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1); case ISD::ConstantFP: { auto F = cast(Op)->getValueAPF(); @@ -4704,7 +4707,7 @@ bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) && - isCanonicalized(N0, getSubtarget())) + isCanonicalized(DAG, N0, getSubtarget())) return N0; return SDValue(); Index: test/CodeGen/AMDGPU/fcanonicalize-elimination.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -347,7 +347,9 @@ } ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32: -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; GFX9: flat_store_dword v[{{[0-9:]+}}], [[V]] define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -388,9 +390,11 @@ } ; GCN-LABEL: test_fold_canonicalize_denorm_value_f32: -; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] -; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0x7fffff, v{{[0-9]+}} +; VI: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}} +; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] +; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -402,9 +406,11 @@ } ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32: -; GCN: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] +; GFX9: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} +; VI: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}} +; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] ; GCN: flat_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id