Index: lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructions.td +++ lib/Target/AMDGPU/AMDGPUInstructions.td @@ -42,9 +42,9 @@ field bits<32> Inst = 0xffffffff; } -def FP16Denormals : Predicate<"Subtarget.hasFP16Denormals()">; -def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; -def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; +def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">; +def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">; +def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">; def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; def InstFlag : OperandWithDefaultOps ; Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -1277,16 +1277,37 @@ (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0) >; +let Predicates = [FP16Denormals], AddedComplexity = 1 in { +def : Pat< + (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), + (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0) +>; +} + def : Pat< (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0) >; +let Predicates = [FP32Denormals], AddedComplexity = 1 in { +def : Pat< + (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), + (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src, 0, 0) +>; +} + def : Pat< (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0) >; +let Predicates = [FP64Denormals], AddedComplexity = 1 in { +def : Pat< + (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), + (V_MAX_F64 $src_mods, $src, $src_mods, $src, 0, 0) +>; +} + def : Pat< (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), (V_PK_MUL_F16 SRCMODS.OP_SEL_1, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) Index: test/CodeGen/AMDGPU/fcanonicalize-denorms.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fcanonicalize-denorms.ll @@ -0,0 +1,48 @@ +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals,+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-DENORM %s + +declare float @llvm.canonicalize.f32(float) #0 +declare double @llvm.canonicalize.f64(double) #0 +declare half @llvm.canonicalize.f16(half) #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +; GCN-LABEL: {{^}}test_canonicalize_value_f64: +; GCN-FLUSH: v_mul_f64 v[{{[0-9:]+}}], 1.0, v[{{[0-9:]+}}] +; GCN-DENORM: v_max_f64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] +define amdgpu_kernel void @test_canonicalize_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id + %v = load double, double addrspace(1)* %gep, align 8 + %canonicalized = tail call double @llvm.canonicalize.f64(double %v) + %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id + store double %canonicalized, double addrspace(1)* %gep2, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_canonicalize_value_f32: +; GCN-FLUSH: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} +; GCN-DENORM: v_max_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define amdgpu_kernel void @test_canonicalize_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id + %v = load float, float addrspace(1)* %gep, align 4 + %canonicalized = tail call float @llvm.canonicalize.f32(float %v) + %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id + store float %canonicalized, float addrspace(1)* %gep2, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_canonicalize_value_f16: +; GCN-FLUSH: v_mul_f16_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} +; GCN-DENORM: v_max_f16_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define amdgpu_kernel void @test_canonicalize_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id + %v = load half, half addrspace(1)* %gep, align 2 + %canonicalized = tail call half @llvm.canonicalize.f16(half %v) + %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id + store half %canonicalized, half addrspace(1)* %gep2, align 2 + ret void +} + +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/fcanonicalize-elimination.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -4,7 +4,8 @@ ; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GCN-FLUSH %s ; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32: -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; GFX9-DENORM: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -129,7 +130,8 @@ ; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32: ; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]], -; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]] +; GCN-FLUSH: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]] +; GCN-DENORM: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]] ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] ; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) { @@ -223,7 +225,8 @@ } ; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32: -; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, -v{{[0-9]+}} +; GCN-FLUSH: v_mul_f32_e64 v{{[0-9]+}}, 1.0, -v{{[0-9]+}} +; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -250,7 +253,8 @@ } ; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32: -; GCN: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}| +; GCN-FLUSH: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}| +; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -377,7 +381,8 @@ ; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32: ; GCN: v_min_f32_e32 [[V0:v[0-9]+]], 0x7f800001, v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] +; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[V0]] +; GCN-DENORM: v_max_f32_e32 v{{[0-9]+}}, [[V0]], [[V0]] ; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() Index: test/CodeGen/AMDGPU/fcanonicalize.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -9,7 +9,7 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16: -; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} +; GCN: v_max_f16_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; GCN: buffer_store_short [[REG]] define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out @@ -19,7 +19,7 @@ } ; GCN-LABEL: {{^}}s_test_canonicalize_var_f16: -; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}} +; GCN: v_max_f16_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} ; GCN: buffer_store_short [[REG]] define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 { %val = bitcast i16 %val.arg to half @@ -29,7 +29,7 @@ } ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16: -; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}| +; GCN: v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}| ; GCN: buffer_store_short [[REG]] define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out @@ -40,7 +40,7 @@ } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16: -; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}| +; GCN: v_max_f16_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}| ; GCN: buffer_store_short [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out @@ -52,7 +52,7 @@ } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16: -; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}} +; GCN: v_max_f16_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}} ; GCN: buffer_store_short [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out @@ -207,9 +207,8 @@ } ; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16: -; VI: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00 -; VI-DAG: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}} +; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; VI-NOT: v_and_b32 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}} @@ -227,9 +226,8 @@ ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16: ; VI-DAG: v_bfe_u32 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}} -; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00 -; VI: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}} +; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; VI-NOT: 0xffff ; VI: v_or_b32 @@ -247,10 +245,9 @@ } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16: -; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00 ; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}} -; VI-DAG: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}} +; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_or_b32 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} @@ -269,10 +266,10 @@ ; FIXME: Fold modifier ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16: -; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00 -; VI-DAG: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}} -; VI-DAG: v_mul_f16_sdwa [[REG1:v[0-9]+]], v[[CONST1]], [[FNEG]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]] +; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}} +; VI: v_lshrrev_b32_e32 [[FNEGHI:v[0-9]+]], 16, [[FNEG]] +; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], [[FNEG]], [[FNEGHI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_max_f16_e32 [[REG0:v[0-9]+]], [[FNEG]], [[FNEG]] ; VI-NOT: 0xffff ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}} @@ -288,9 +285,8 @@ } ; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16: -; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 0x3c00 -; VI: v_mul_f16_sdwa [[REG0:v[0-9]+]], [[ONE]], {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}} +; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_max_f16_e64 [[REG1:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} ; VI-NOT: v_and_b32 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}} Index: test/CodeGen/AMDGPU/fcanonicalize.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize.ll +++ test/CodeGen/AMDGPU/fcanonicalize.ll @@ -1,4 +1,4 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s declare float @llvm.fabs.f32(float) #0 declare float @llvm.canonicalize.f32(float) #0 @@ -203,7 +203,7 @@ } ; GCN-LABEL: {{^}}v_test_canonicalize_var_f64: -; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{v\[[0-9]+:[0-9]+\]}} +; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 [[REG]] define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out @@ -213,7 +213,7 @@ } ; GCN-LABEL: {{^}}s_test_canonicalize_var_f64: -; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{s\[[0-9]+:[0-9]+\]}} +; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 [[REG]] define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 { %canonicalized = call double @llvm.canonicalize.f64(double %val) @@ -222,7 +222,7 @@ } ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64: -; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, |{{v\[[0-9]+:[0-9]+\]}}| +; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], |{{v\[[0-9]+:[0-9]+\]}}|, |{{v\[[0-9]+:[0-9]+\]}}| ; GCN: buffer_store_dwordx2 [[REG]] define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out @@ -233,7 +233,7 @@ } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64: -; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]\]]], 1.0, -|{{v\[[0-9]+:[0-9]+\]}}| +; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]\]]], -|{{v\[[0-9]+:[0-9]+\]}}|, -|{{v\[[0-9]+:[0-9]+\]}}| ; GCN: buffer_store_dwordx2 [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out @@ -245,7 +245,7 @@ } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64: -; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, -{{v\[[0-9]+:[0-9]+\]}} +; GCN: v_max_f64 [[REG:v\[[0-9]+:[0-9]+\]]], -{{v\[[0-9]+:[0-9]+\]}}, -{{v\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out