diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -226,6 +226,26 @@ DSTCLAMP.ENABLE, (i32 (IMPLICIT_DEF))))) >; + /* Select an f32 fmul whose result is rounded to f16 (fpround) to mixlo_inst. Both multiplicands are matched through VOP3PMadMixMods, which yields a source plus its modifier operand. No clamp is requested (DSTCLAMP.NONE) and the final operand is IMPLICIT_DEF. */ + def : GCNPat < + (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))), + (mixlo_inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + (i32 0), (i32 0), /* NOTE(review): presumably src2_modifiers = 0 and src2 = 0, i.e. the multiply is emitted as a multiply-add with a zero addend; the regenerated checks print a literal 0 as the third source. Confirm a +0 addend is acceptable for signed-zero results. */ + DSTCLAMP.NONE, + (i32 (IMPLICIT_DEF))) + >; + /* Same fmul + fpround match, but as the second element of a build_vector whose first element is f16:$elt0. Emits mixhi_inst typed as v2f16 with VGPR_32:$elt0 as the final operand. NOTE(review): presumably $elt0 is passed so the existing low half is preserved -- confirm against the mixhi_inst definition. */ + def : GCNPat < + (build_vector f16:$elt0, (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))), + (v2f16 (mixhi_inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + (i32 0), (i32 0), /* NOTE(review): presumably a zero third source with no modifiers, as in the mixlo pattern -- confirm. */ + DSTCLAMP.NONE, + VGPR_32:$elt0)) + >; } let SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals] in { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -3,17 +3,17 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6,GFX6-IEEE %s ; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6,GFX6-FLUSH %s -; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8,GFX8-IEEE %s +; RUN: llc -global-isel -march=amdgcn -mcpu=fiji 
-denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8,GFX8-FLUSH %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-IEEE %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-FLUSH %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-IEEE %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-FLUSH %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-IEEE %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 
-denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FLUSH %s define half @v_fdiv_f16(half %a, half %b) { ; GFX6-IEEE-LABEL: v_fdiv_f16: @@ -57,25 +57,43 @@ ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX89-LABEL: v_fdiv_f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX89-NEXT: v_rcp_f32_e32 v2, v2 -; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: v_fdiv_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-IEEE-LABEL: v_fdiv_f16: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_fdiv_f16: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-NEXT: 
v_cvt_f32_f16_e32 v3, v0 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -83,11 +101,9 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half %a, %b @@ -172,25 +188,43 @@ ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; -; GFX89-LABEL: v_fdiv_f16_ulp25: -; GFX89: ; %bb.0: -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX89-NEXT: v_rcp_f32_e32 v2, v2 -; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: v_fdiv_f16_ulp25: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-IEEE-LABEL: v_fdiv_f16_ulp25: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; 
GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_fdiv_f16_ulp25: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_f16_ulp25: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -198,11 +232,9 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half %a, %b, !fpmath !0 @@ -659,25 +691,41 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fdiv_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; 
GFX9-NEXT: v_mul_f32_e32 v5, v7, v5 -; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX9-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX9-NEXT: v_div_fixup_f16 v1, v5, v4, v2 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_fdiv_v2f16: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v6, v3 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v5 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v5, v4, v2 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_fdiv_v2f16: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, v0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, v5 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16: ; GFX10: ; %bb.0: @@ -685,15 +733,11 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; 
GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5 ; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 ; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -705,15 +749,12 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v3, v6, v3 :: v_dual_mul_f32 v4, v7, v4 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0 ; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -877,25 +918,41 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fdiv_v2f16_ulp25: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v0 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, 
v3 -; GFX9-NEXT: v_mul_f32_e32 v5, v7, v5 -; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX9-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX9-NEXT: v_div_fixup_f16 v1, v5, v4, v2 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_fdiv_v2f16_ulp25: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v4 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v6, v3 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v5 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v5, v4, v2 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_fdiv_v2f16_ulp25: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, v0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, v5 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_v2f16_ulp25: ; GFX10: ; %bb.0: @@ -903,15 +960,11 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: 
v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5 ; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0 ; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -923,15 +976,12 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v3, v6, v3 :: v_dual_mul_f32 v4, v7, v4 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0 ; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -1030,37 +1080,49 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_rcp_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; 
GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_rcp_v2f16: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_rcp_v2f16: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; 
GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -1071,15 +1133,12 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -1178,37 +1237,49 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_neg_rcp_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_neg_rcp_v2f16: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, -1.0 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_neg_rcp_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 
-1.0 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -1219,15 +1290,12 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -1337,39 +1405,52 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_rcp_v2f16_fabs: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_rcp_v2f16_fabs: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: 
v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_rcp_v2f16_fabs: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16_fabs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX10-NEXT: v_pack_b32_f16 v0, 
v0, v1 @@ -1378,21 +1459,17 @@ ; GFX11-LABEL: v_rcp_v2f16_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) @@ -1501,39 +1578,52 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_neg_rcp_v2f16_fabs: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_neg_rcp_v2f16_fabs: +; GFX9-IEEE: ; 
%bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16_fabs: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, -1.0 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_neg_rcp_v2f16_fabs: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; 
GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -1542,21 +1632,17 @@ ; GFX11-LABEL: v_neg_rcp_v2f16_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) @@ -1832,37 +1918,49 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_rcp_v2f16_ulp25: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: 
v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_rcp_v2f16_ulp25: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_rcp_v2f16_ulp25: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rcp_v2f16_ulp25: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; 
GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -1873,15 +1971,12 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -2174,25 +2269,45 @@ ; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-FLUSH-NEXT: ; return to shader part epilog ; -; GFX89-LABEL: s_fdiv_f16: -; GFX89: ; %bb.0: -; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX89-NEXT: v_rcp_f32_e32 v0, v0 -; GFX89-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX89-NEXT: v_mov_b32_e32 v1, s1 -; GFX89-NEXT: v_div_fixup_f16 v0, v0, v1, s0 -; GFX89-NEXT: v_readfirstlane_b32 s0, v0 -; GFX89-NEXT: ; return to shader part epilog +; GFX8-LABEL: s_fdiv_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX8-NEXT: v_rcp_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_div_fixup_f16 v0, v0, v1, s0 +; 
GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-IEEE-LABEL: s_fdiv_f16: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-IEEE-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v0, v1, s0 +; GFX9-IEEE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-IEEE-NEXT: ; return to shader part epilog +; +; GFX9-FLUSH-LABEL: s_fdiv_f16: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v1, s0 +; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-FLUSH-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fdiv_f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, s0 ; GFX10-NEXT: v_rcp_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -2200,11 +2315,9 @@ ; GFX11-LABEL: s_fdiv_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0 ; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: ; return to shader part epilog @@ -2427,27 +2540,45 @@ ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: 
s_fdiv_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: v_cvt_f32_f16_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, s2 -; GFX9-NEXT: v_rcp_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_div_fixup_f16 v0, v0, v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_div_fixup_f16 v1, v1, v2, s2 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX9-IEEE-LABEL: s_fdiv_v2f16: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX9-IEEE-NEXT: s_lshr_b32 s3, s1, 16 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX9-IEEE-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, s0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, s2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-IEEE-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v0, v2, s0 +; GFX9-IEEE-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v1, v2, s2 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-IEEE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-IEEE-NEXT: ; return to shader part epilog +; +; GFX9-FLUSH-LABEL: s_fdiv_v2f16: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1 +; GFX9-FLUSH-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX9-FLUSH-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s1 +; 
GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, s2, v2 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-FLUSH-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fdiv_v2f16: ; GFX10: ; %bb.0: @@ -2455,14 +2586,10 @@ ; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, s3 ; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0 ; GFX10-NEXT: v_div_fixup_f16 v1, v1, s2, s3 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -2475,14 +2602,11 @@ ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX11-NEXT: s_lshr_b32 s3, s0, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s3 ; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v0, v2, v0 :: v_dual_mul_f32 v1, v3, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0 ; GFX11-NEXT: v_div_fixup_f16 v1, v1, s2, s3 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 @@ 
-2811,40 +2935,54 @@ ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_rsq_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_sqrt_f16_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: v_sqrt_f16_e32 v1, s0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 -; GFX9-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX9-IEEE-LABEL: s_rsq_v2f16: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v0, s0 +; GFX9-IEEE-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, s0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-IEEE-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-IEEE-NEXT: ; return to shader part epilog +; +; GFX9-FLUSH-LABEL: s_rsq_v2f16: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0 +; GFX9-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 
+; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-FLUSH-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_rsq_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX10-NEXT: v_sqrt_f16_e32 v1, s1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -2856,17 +2994,14 @@ ; GFX11-NEXT: s_lshr_b32 s1, s0, 16 ; GFX11-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, s1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -3364,39 
+3499,52 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_rsq_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_sqrt_f16_e32 v1, v0 -; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 -; GFX9-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 -; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_rsq_v2f16: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 +; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_rsq_v2f16: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 +; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 +; 
GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rsq_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 ; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 @@ -3407,7 +3555,6 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 @@ -3415,10 +3562,8 @@ ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 
v0, v2, v0, 1.0 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -3532,39 +3677,52 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_neg_rsq_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_sqrt_f16_e32 v1, v0 -; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, -1.0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX9-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 -; GFX9-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 -; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-IEEE-LABEL: v_neg_rsq_v2f16: +; GFX9-IEEE: ; %bb.0: +; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 +; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 +; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 +; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-FLUSH-LABEL: v_neg_rsq_v2f16: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 +; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 +; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_neg_rsq_v2f16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0 ; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 @@ -3575,7 +3733,6 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 @@ -3583,10 +3740,8 @@ ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; 
GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -3602,3 +3757,10 @@ declare <2 x half> @llvm.sqrt.v2f16(<2 x half>) !0 = !{float 2.500000e+00} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10-FLUSH: {{.*}} +; GFX10-IEEE: {{.*}} +; GFX11-FLUSH: {{.*}} +; GFX11-IEEE: {{.*}} +; GFX8-FLUSH: {{.*}} +; GFX8-IEEE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -82,11 +82,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, v1 ; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm @@ -102,11 +100,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v2, v1 ; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm @@ -122,14 +118,11 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, 
v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v2, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -125,13 +125,10 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 ; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1 @@ -148,13 +145,10 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, 
v1 @@ -171,19 +165,15 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] ; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f16_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1748,23 +1738,18 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 ; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mad_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] 
op_sel_hi:[1,0,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v5 -; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1 ; GFX9-NEXT: v_trunc_f16_e32 v4, v4 ; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1 @@ -1782,23 +1767,18 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 ; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX10-NEXT: v_rcp_f32_e32 v4, v4 +; GFX10-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX10-NEXT: v_rcp_f32_e32 v5, v5 -; GFX10-NEXT: v_mul_f32_e32 v4, v4, v5 -; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v4, v4 ; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1 @@ -1816,35 +1796,28 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; 
GFX11-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f16_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX11-NEXT: v_rcp_f32_e32 v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v5 +; GFX11-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX11-NEXT: v_div_fixup_f16 v4, v4, v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f16_e32 v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fma_f16 v1, -v4, v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2133,42 +2106,33 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] ; 
GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX9-NEXT: v_rcp_f32_e32 v6, v6 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX9-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-NEXT: v_mad_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0] ; GFX9-NEXT: v_div_fixup_f16 v5, v5, v3, v1 ; GFX9-NEXT: v_trunc_f16_e32 v5, v5 ; GFX9-NEXT: v_fma_f16 v5, -v5, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX9-NEXT: v_rcp_f32_e32 v6, v6 +; GFX9-NEXT: v_mad_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX9-NEXT: v_rcp_f32_e32 v7, v7 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v7 -; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1 ; GFX9-NEXT: v_trunc_f16_e32 v6, v6 ; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 ; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX9-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-NEXT: v_mad_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX9-NEXT: v_rcp_f32_e32 v6, v6 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 -; 
GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0 ; GFX9-NEXT: v_trunc_f16_e32 v5, v5 ; GFX9-NEXT: v_fma_f16 v0, -v5, v2, v0 @@ -2186,42 +2150,33 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX10-NEXT: v_rcp_f32_e32 v6, v6 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX10-NEXT: v_rcp_f32_e32 v5, v5 +; GFX10-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5 ; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX10-NEXT: v_rcp_f32_e32 v6, v6 +; GFX10-NEXT: v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX10-NEXT: v_rcp_f32_e32 v7, v7 -; GFX10-NEXT: v_mul_f32_e32 v6, v6, v7 -; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1 ; GFX10-NEXT: v_trunc_f16_e32 v6, v6 ; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX10-NEXT: v_rcp_f32_e32 v5, v5 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 ; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX10-NEXT: 
v_rcp_f32_e32 v5, v5 +; GFX10-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX10-NEXT: v_rcp_f32_e32 v6, v6 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5 ; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0 @@ -2239,57 +2194,45 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v6, v6 +; GFX11-NEXT: v_rcp_f32_e32 v5, v5 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f16_e32 v5, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_fma_f16 v5, -v5, v3, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX11-NEXT: v_rcp_f32_e32 v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX11-NEXT: v_rcp_f32_e32 v6, v6 ; GFX11-NEXT: 
s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v6, v6, v7 +; GFX11-NEXT: v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX11-NEXT: v_div_fixup_f16 v6, v6, v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f16_e32 v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_fma_f16 v1, -v6, v3, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX11-NEXT: v_pack_b32_f16 v1, v5, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX11-NEXT: v_rcp_f32_e32 v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f16_e32 v3, v3 ; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2 +; 
GFX11-NEXT: v_rcp_f32_e32 v5, v5 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v5, v6 -; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX11-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_fixup_f16 v5, v5, v2, v0 ; GFX11-NEXT: v_trunc_f16_e32 v5, v5 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -2202,6 +2202,195 @@ ret <4 x half> %cvt.result } +define half @mixlo_fptrunc(float %a, float %b) #0 { +; GFX1100-LABEL: mixlo_fptrunc: +; GFX1100: ; %bb.0: ; %.entry +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, 0 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: mixlo_fptrunc: +; GFX900: ; %bb.0: ; %.entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, 0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: mixlo_fptrunc: +; GFX906: ; %bb.0: ; %.entry +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, 0 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: mixlo_fptrunc: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: mixlo_fptrunc: +; SDAG-CI: ; %bb.0: ; %.entry +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: mixlo_fptrunc: +; GISEL-CI: ; %bb.0: ; %.entry +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: 
v_mul_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +.entry: + %mul = fmul float %a, %b + %trunc = fptrunc float %mul to half + ret half %trunc +} + +define half @mixlo_fptrunc_no_flush(float %a, float %b) { +; GFX1100-LABEL: mixlo_fptrunc_no_flush: +; GFX1100: ; %bb.0: ; %.entry +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, 0 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: mixlo_fptrunc_no_flush: +; GFX900: ; %bb.0: ; %.entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: mixlo_fptrunc_no_flush: +; GFX906: ; %bb.0: ; %.entry +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, 0 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: mixlo_fptrunc_no_flush: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: mixlo_fptrunc_no_flush: +; SDAG-CI: ; %bb.0: ; %.entry +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: mixlo_fptrunc_no_flush: +; GISEL-CI: ; %bb.0: ; %.entry +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +.entry: + %mul = fmul float %a, %b + %trunc = fptrunc float %mul to half + ret half %trunc +} + +define half @mixlo_fptrunc_abs_src_mod(float %a, float %b) #0 { +; GFX1100-LABEL: mixlo_fptrunc_abs_src_mod: +; GFX1100: ; %bb.0: ; %.entry +; GFX1100-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mixlo_f16 v0, |v0|, v1, 0 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: mixlo_fptrunc_abs_src_mod: +; GFX900: ; %bb.0: ; %.entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v0, |v0|, v1, 0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: mixlo_fptrunc_abs_src_mod: +; GFX906: ; %bb.0: ; %.entry +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v0, |v0|, v1, 0 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: mixlo_fptrunc_abs_src_mod: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: mixlo_fptrunc_abs_src_mod: +; SDAG-CI: ; %bb.0: ; %.entry +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: mixlo_fptrunc_abs_src_mod: +; GISEL-CI: ; %bb.0: ; %.entry +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +.entry: + %a.fabs = call float @llvm.fabs.f32(float %a) + %mul = fmul float %a.fabs, %b + %trunc = fptrunc float %mul to half + ret half %trunc +} + +define half @mixlo_fptrunc_neg_src_mod(float %a, float %b) #0 { +; GFX1100-LABEL: mixlo_fptrunc_neg_src_mod: +; GFX1100: ; %bb.0: ; %.entry +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mixlo_f16 v0, -v0, v1, 0 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: mixlo_fptrunc_neg_src_mod: +; GFX900: ; %bb.0: ; %.entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v0, -v0, v1, 0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX906-LABEL: mixlo_fptrunc_neg_src_mod: +; GFX906: ; %bb.0: ; %.entry +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v0, -v0, v1, 0 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: mixlo_fptrunc_neg_src_mod: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: mixlo_fptrunc_neg_src_mod: +; SDAG-CI: ; %bb.0: ; %.entry +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: mixlo_fptrunc_neg_src_mod: +; GISEL-CI: ; %bb.0: ; %.entry +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] +.entry: + %a.fneg = fneg float %a + %mul = fmul float %a.fneg, %b + %trunc = fptrunc float %mul to half + ret half %trunc +} + +declare float @llvm.fabs.f32(float) #1 + declare half @llvm.minnum.f16(half, half) #1 declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1 declare <3 x half> @llvm.minnum.v3f16(<3 x half>, <3 x half>) #1