# Patch summary (commentary only; these lines precede the first "Index:"
# header and belong to no hunk).
#
# lib/Target/AMDGPU/SIISelLowering.cpp
#   In the opcode switch touched by the hunk, ISD::FP_ROUND, ISD::FP_EXTEND,
#   ISD::FP16_TO_FP and ISD::FP_TO_FP16 lose their individual returns, which
#   depended on the f16 scalar type and Subtarget->hasFP16Denormals(). They
#   now share the unconditional `return true` with FSQRT / FDIV / FREM /
#   FMUL_LEGACY / FMAD_FTZ.
#   NOTE(review): the enclosing function lies outside the hunk context;
#   presumably it is the "is this value already canonical" query used when
#   folding fcanonicalize -- confirm against the full file.
#
# test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
#   * Adds two kernels tagged with the new attribute group #2
#     ("target-features"="-fp64-fp16-denormals"): one feeds an f16 -> f32
#     fpext and one feeds an f32 -> f16 fptrunc into llvm.canonicalize. Both
#     require that no v_mul or v_max appears between the v_cvt and the store
#     of its result.
#   * In the existing test_fold_canonicalize_fpround_value_f16_f32, replaces
#     the `GCN-NOT: 1.0` check that followed the store with `GCN-NOT: v_max`
#     and `GCN-NOT: v_mul` checks placed between the v_cvt and the store.
#
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6747,20 +6747,13 @@
   case ISD::FSQRT:
   case ISD::FDIV:
   case ISD::FREM:
-  case AMDGPUISD::FMUL_LEGACY:
-  case AMDGPUISD::FMAD_FTZ:
-    return true;
   case ISD::FP_ROUND:
-    return Op.getValueType().getScalarType() != MVT::f16 ||
-           Subtarget->hasFP16Denormals();
-
   case ISD::FP_EXTEND:
-    return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
-           Subtarget->hasFP16Denormals();
-
   case ISD::FP16_TO_FP:
   case ISD::FP_TO_FP16:
-    return Subtarget->hasFP16Denormals();
+  case AMDGPUISD::FMUL_LEGACY:
+  case AMDGPUISD::FMAD_FTZ:
+    return true;
 
   // It can/will be lowered or combined as a bit operation.
   // Need to check their input recursively to handle.
Index: test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
===================================================================
--- test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -215,6 +215,22 @@
   ret void
 }
 
+; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16_flushf16:
+; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN-NOT: v_mul
+; GCN-NOT: v_max
+; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
+define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16_flushf16(half addrspace(1)* %arg, float addrspace(1)* %out) #2 {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
+  %load = load half, half addrspace(1)* %gep, align 2
+  %v = fpext half %load to float
+  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
+  store float %canonicalized, float addrspace(1)* %gep2, align 4
+  ret void
+}
+
 ; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
 ; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
 ; GCN-NOT: v_mul
@@ -233,8 +249,9 @@
 
 ; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32:
 ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN-NOT: v_max
+; GCN-NOT: v_mul
 ; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
-; GCN-NOT: 1.0
 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
@@ -246,6 +263,22 @@
   ret void
 }
 
+; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32_flushf16:
+; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
+; GCN-NOT: v_max
+; GCN-NOT: v_mul
+; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
+define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16(float addrspace(1)* %arg, half addrspace(1)* %out) #2 {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
+  %load = load float, float addrspace(1)* %gep, align 4
+  %v = fptrunc float %load to half
+  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
+  %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
+  store half %canonicalized, half addrspace(1)* %gep2, align 2
+  ret void
+}
+
 ; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
 ; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
 ; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
@@ -738,3 +771,4 @@
 
 attributes #0 = { nounwind readnone }
 attributes #1 = { "no-nans-fp-math"="true" }
+attributes #2 = { "target-features"="-fp64-fp16-denormals" }