Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -1283,6 +1283,11 @@ (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0) >; + +def : Pat< + (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), + (V_PK_MUL_F16 SRCMODS.OP_SEL_1, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) +>; } let Predicates = [FP16Denormals] in { @@ -1290,6 +1295,11 @@ (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0) >; + +def : Pat< + (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), + (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE) +>; } let Predicates = [NoFP32Denormals] in { @@ -1320,11 +1330,6 @@ >; } -def : Pat< - (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), - (V_PK_MUL_F16 SRCMODS.OP_SEL_1, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) ->; - // Allow integer inputs class ExpPattern : Pat< Index: test/CodeGen/AMDGPU/fcanonicalize.f16.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -211,7 +211,7 @@ ; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; VI-NOT: v_and_b32 -; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}} +; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+$}} ; GFX9: buffer_store_dword [[REG]] define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -232,7 +232,7 @@ ; VI: v_or_b32 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} -; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}} +; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], [[ABS]], [[ABS]]{{$}} ; GCN: buffer_store_dword define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -251,7 +251,7 @@ ; VI: v_or_b32 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} -; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}} +; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], [[ABS]], [[ABS]] neg_lo:[1,1] neg_hi:[1,1]{{$}} ; GCN: buffer_store_dword define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -272,7 +272,7 @@ ; VI-DAG: v_max_f16_e32 [[REG0:v[0-9]+]], [[FNEG]], [[FNEG]] ; VI-NOT: 0xffff -; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}} +; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} neg_lo:[1,1] neg_hi:[1,1]{{$}} ; GFX9: buffer_store_dword [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -289,7 +289,7 @@ ; VI: v_max_f16_e64 [[REG1:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} ; VI-NOT: v_and_b32 -; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}} +; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+$}} ; GFX9: buffer_store_dword [[REG]] define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 { %val = bitcast i32 %val.arg to <2 x half> Index: test/CodeGen/AMDGPU/fcanonicalize.ll =================================================================== --- test/CodeGen/AMDGPU/fcanonicalize.ll +++ test/CodeGen/AMDGPU/fcanonicalize.ll @@ -5,6 +5,7 @@ declare double @llvm.fabs.f64(double) #0 declare double @llvm.canonicalize.f64(double) #0 declare half @llvm.canonicalize.f16(half) #0 +declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0 declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f32: @@ -453,6 +454,32 @@ ret void } +; GCN-LABEL: {{^}}test_canonicalize_value_v2f16_flush_gfx8: +; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 0x3c00 +; GCN-DAG: v_mul_f16_sdwa v{{[0-9]+}}, [[ONE]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GCN-DAG: v_mul_f16_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +define amdgpu_kernel void @test_canonicalize_value_v2f16_flush_gfx8(<2 x half> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) #4 { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i32 %id + %v = load <2 x half>, <2 x half> addrspace(1)* %gep, align 4 + %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v) + %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 2 + ret void +} + +; GCN-LABEL: {{^}}test_canonicalize_value_v2f16_flush_gfx9: +; GCN-DAG: v_pk_mul_f16 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +define amdgpu_kernel void @test_canonicalize_value_v2f16_flush_gfx9(<2 x half> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) #6 { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i32 %id + %v = load <2 x half>, <2 x half> addrspace(1)* %gep, align 4 + %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v) + %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 2 + ret void +} + ; GCN-LABEL: {{^}}test_canonicalize_value_f64_denorm: ; GCN: v_max_f64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] define amdgpu_kernel void @test_canonicalize_value_f64_denorm(double addrspace(1)* %arg, double addrspace(1)* %out) #5 { @@ -489,9 +516,22 @@ ret void } +; GCN-LABEL: {{^}}test_canonicalize_value_v2f16_denorm: +; GCN: v_pk_max_f16 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(<2 x half> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) #5 { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %arg, i32 %id + %v = load <2 x half>, <2 x half> addrspace(1)* %gep, align 4 + %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v) + %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id + store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 2 + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } attributes #2 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" } attributes #3 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" } attributes #4 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" "target-cpu"="tonga" } attributes #5 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" "target-cpu"="gfx900" } +attributes #6 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" "target-cpu"="gfx900" }