diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -1,12 +1,32 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=VI %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=CI %s -; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: -; GFX9: s_waitcnt -; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 -; GFX9-NEXT: s_setpc_b64 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %src1, half %src2) #0 { +; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -16,13 +36,34 @@ ret <2 x half> %vec.result } -; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: -; GFX9: s_waitcnt -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3c00 -; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: s_setpc_b64 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %src1, half %src2) #0 { +; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, 0x3c00 +; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, 0x3c00, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; CI-NEXT: v_mov_b32_e32 v0, 1.0 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -32,12 +73,33 @@ ret <2 x half> %vec.result } -; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: -; GFX9: s_waitcnt -; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 -; GFX9-NEXT: 
v_mov_b32_e32 v0, v3 -; GFX9-NEXT: s_setpc_b64 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 { +; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; CI-NEXT: v_mov_b32_e32 v0, v3 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -48,12 +110,31 @@ ret <2 x half> %vec.result } -; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: -; GFX9: s_waitcnt -; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 { +; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -65,12 +146,31 @@ ret i32 %shr } -; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: -; GFX9: s_waitcnt -; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 { +; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; CI-NEXT: 
v_cvt_f16_f32_e32 v0, v2 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -82,12 +182,31 @@ ret i32 %shr } -; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: -; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}} -; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GFX9-NEXT: s_setpc_b64 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %src0, half %src1, half %src2) #0 { +; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -99,11 +218,30 @@ ret <2 x half> %vec.result } -; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: -; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 
op_sel_hi:[1,1,1] clamp{{$}} -; GFX9-NEXT: s_setpc_b64 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half %src0, half %src1, half %src2) #0 { +; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -115,15 +253,40 @@ ret <2 x half> %vec.result } - -; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: -; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}} -; GFX9-NEXT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}} -; GFX9-NEXT: s_setpc_b64 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 { +; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 
op_sel_hi:[1,1,1] +; GFX9-NEXT: global_store_short v[0:1], v3, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; VI-NEXT: flat_store_short v[0:1], v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_f16_sdwa v0, v0, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp +; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -1,28 +1,71 @@ -; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX906 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX900 %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s -; RUN: llc -march=amdgcn 
-mcpu=hawaii -verify-machineinstrs -enable-misched=false < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX906 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX900 %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=CI %s -; GCN-LABEL: mixlo_simple: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2{{$}} -; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2{{$}} -; GFX9-NEXT: s_setpc_b64 - -; CIVI: v_mac_f32_e32 -; CIVI: v_cvt_f16_f32_e32 define half @mixlo_simple(float %src0, float %src1, float %src2) #0 { +; GFX906-LABEL: mixlo_simple: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: mixlo_simple: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: mixlo_simple: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: mixlo_simple: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2) %cvt.result = fptrunc float %result to half ret half %cvt.result } -; GCN-LABEL: 
{{^}}v_mad_mixlo_f16_f16lo_f16lo_f16lo: -; GFX900: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]{{$}} -; GFX906: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]{{$}} -; CI: v_mac_f32 -; CIVI: v_cvt_f16_f32 define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { +; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -31,14 +74,35 @@ ret half %cvt.result } -; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]{{$}} -; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]{{$}} -; GFX9-NEXT: s_setpc_b64 - -; CIVI: v_mac_f32 define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 { +; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -46,14 +110,35 @@ ret half %cvt.result } -; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}} -; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}} -; GFX9-NEXT: s_setpc_b64 - -; CIVI: v_mac_f32_e32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]$}} define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 { +; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; 
GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_e64 v0, v2 clamp +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mac_f32_e32 v2, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -63,15 +148,37 @@ ret half %clamp } -; GCN-LABEL: {{^}}v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}} -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp{{$}} -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: s_setpc_b64 - -; CIVI: v_mad_f32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} clamp{{$}} define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src1, float %src2) #0 { +; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) @@ -81,20 +188,64 @@ ret half %cvt.result } -; FIXME: Should abe able to avoid extra register because first +; FIXME(DAG): Should be able to avoid extra register because first ; operation only clobbers relevant lane. -; GCN-LABEL: {{^}}v_mad_mix_v2f32: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}} -; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]{{$}} - -; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}} -; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]{{$}} - -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: s_setpc_b64 define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX906-LABEL: v_mad_mix_v2f32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_mov_b32_e32 v0, v3 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_v2f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +;
GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v5, v3, v4 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v2f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2.ext = fpext <2 x half> %src2 to <2 x float> @@ -103,19 +254,77 @@ ret <2 x half> %cvt.result } -; GCN-LABEL: {{^}}v_mad_mix_v3f32: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 
op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] - -; GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] - -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: s_setpc_b64 define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { +; GFX906-LABEL: v_mad_mix_v3f32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_mov_b32_e32 v0, v3 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_v3f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v3f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; VI-NEXT: v_mac_f32_e32 v8, v6, v7 +; VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD +; VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; VI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; VI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v3f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_mac_f32_e32 v7, v1, v4 +; CI-NEXT: v_mac_f32_e32 v6, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; CI-NEXT: v_mac_f32_e32 v8, v2, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <3 x half> %src0 to <3 x float> %src1.ext = fpext <3 x half> %src1 to <3 x float> %src2.ext = fpext <3 x half> %src2 to <3 x float> @@ -124,22 +333,96 @@ ret <3 x half> %cvt.result } -; GCN-LABEL: {{^}}v_mad_mix_v4f32: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] - -; GFX906-NEXT: v_fma_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mixlo_f16 v7, v0, v2, v4 
op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] - -; GFX9-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-NEXT: s_setpc_b64 define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { +; GFX906-LABEL: v_mad_mix_v4f32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_mov_b32_e32 v0, v7 +; GFX906-NEXT: v_mov_b32_e32 v1, v6 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_v4f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mov_b32_e32 v0, v7 +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; VI-NEXT: v_cvt_f32_f16_e32 
v2, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; VI-NEXT: v_mac_f32_e32 v10, v6, v8 +; VI-NEXT: v_mac_f32_e32 v11, v7, v9 +; VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; VI-NEXT: v_cvt_f16_f32_sdwa v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; VI-NEXT: v_cvt_f16_f32_e32 v3, v4 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_or_b32_e32 v0, v3, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v4f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_mac_f32_e32 v10, v2, v6 +; CI-NEXT: v_mac_f32_e32 v9, v1, v5 +; CI-NEXT: v_mac_f32_e32 v8, v0, v4 +; CI-NEXT: v_mac_f32_e32 v11, v3, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; 
CI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <4 x half> %src0 to <4 x float> %src1.ext = fpext <4 x half> %src1 to <4 x float> %src2.ext = fpext <4 x half> %src2 to <4 x float> @@ -148,17 +431,63 @@ ret <4 x half> %cvt.result } -; FIXME: Fold clamp -; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt: -; GFX900: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}} -; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp{{$}} - -; GFX906: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}} -; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp{{$}} +; FIXME (DAG): Fold clamp -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: s_setpc_b64 define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_mov_b32_e32 v0, v3 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, 
v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v5, v3, v4 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e64 v1, v2 clamp +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp +; CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2.ext = fpext <2 x half> %src2 to <2 x float> @@ -169,22 +498,83 @@ ret <2 x half> %clamp } -; FIXME: Should be packed into 2 registers per argument? -; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt: -; GCN: s_waitcnt -; GFX900-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX900-DAG: v_mad_mixhi_f16 v{{[0-9]+}}, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-DAG: v_mad_mixlo_f16 v{{[0-9]+}}, v1, v3, v5 op_sel_hi:[1,1,1] +; FIXME (DAG): Should be packed into 2 registers per argument? 
-; GFX906-DAG: v_fma_mixlo_f16 v{{[0-9]+}}, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX906-DAG: v_fma_mixhi_f16 v{{[0-9]+}}, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-DAG: v_fma_mixlo_f16 v{{[0-9]+}}, v1, v3, v5 op_sel_hi:[1,1,1] - - -; GFX9-DAG: v_pk_max_f16 v1, v1, v1 clamp -; GFX9: v_mov_b32_e32 v0, v{{[0-9]+}} -; GFX9-NEXT: s_setpc_b64 define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { +; GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_pack_b32_f16 v1, v1, 0 +; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp +; GFX906-NEXT: v_mov_b32_e32 v0, v3 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_pack_b32_f16 v1, v1, 0 +; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; VI-NEXT: 
v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; VI-NEXT: v_mac_f32_e32 v8, v6, v7 +; VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v8 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp +; VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mac_f32_e32 v8, v2, v5 +; CI-NEXT: v_mac_f32_e32 v6, v0, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; CI-NEXT: v_mac_f32_e32 v7, v1, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v2, v0 clamp +; CI-NEXT: v_cvt_f16_f32_e32 v0, v7 +; CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp +; CI-NEXT: v_cvt_f16_f32_e32 v0, v6 +; CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <3 x half> %src0 to <3 x float> %src1.ext = fpext <3 x half> %src1 to <3 x float> %src2.ext = fpext <3 x half> %src2 to <3 x float> @@ -195,24 +585,96 @@ ret <3 x half> %clamp } -; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_postcvt: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v6, 
v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp - - -; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp - - -; GFX9-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-NEXT: s_setpc_b64 define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { +; GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_mov_b32_e32 v0, v6 +; GFX906-NEXT: v_mov_b32_e32 v1, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; VI-NEXT: v_cvt_f32_f16_sdwa v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; VI-NEXT: v_mac_f32_e32 v10, v6, v8 +; VI-NEXT: v_mac_f32_e32 v11, v7, v9 +; VI-NEXT: v_mac_f32_e32 v4, v0, v2 +; VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v10 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_sdwa v1, v11 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp +; VI-NEXT: v_cvt_f16_f32_e64 v3, v5 clamp +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_or_b32_e32 v1, v3, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: 
v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_mac_f32_e32 v11, v3, v7 +; CI-NEXT: v_mac_f32_e32 v8, v0, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mac_f32_e32 v10, v2, v6 +; CI-NEXT: v_cvt_f32_f16_e64 v3, v0 clamp +; CI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; CI-NEXT: v_mac_f32_e32 v9, v1, v5 +; CI-NEXT: v_cvt_f32_f16_e64 v2, v0 clamp +; CI-NEXT: v_cvt_f16_f32_e32 v0, v9 +; CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp +; CI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <4 x half> %src0 to <4 x float> %src1.ext = fpext <4 x half> %src1 to <4 x float> %src2.ext = fpext <4 x half> %src2 to <4 x float> @@ -223,17 +685,61 @@ ret <4 x half> %clamp } -; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt_lo: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] - -; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] - -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: s_setpc_b64 define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_mov_b32_e32 v0, v3 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v5, v3, v4 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e64 v1, v2 clamp +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2.ext = fpext <2 x half> %src2 to <2 x float> @@ 
-246,17 +752,61 @@ ret <2 x half> %insert } -; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_postcvt_hi: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp - -; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp - -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: s_setpc_b64 define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX906-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_mov_b32_e32 v0, v3 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mac_f32_e32 v5, v3, v4 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: v_cvt_f16_f32_sdwa v0, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 
v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mac_f32_e32 v4, v0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v4 +; CI-NEXT: v_mac_f32_e32 v5, v1, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2.ext = fpext <2 x half> %src2 to <2 x float> @@ -269,19 +819,67 @@ ret <2 x half> %insert } -; FIXME: Should be able to use mixlo/mixhi -; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_precvt: -; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; FIXME (DAG): Should be able to use mixlo/mixhi -; GFX906: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp - -; GFX9: v_cvt_f16_f32_e32 v1, v3 -; GFX9: v_cvt_f16_f32_e32 v0, v0 -; GFX9: v_pack_b32_f16 v0, v0, v1 -; GFX9: s_setpc_b64 define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX906-LABEL: v_mad_mix_v2f32_clamp_precvt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: 
v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX906-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX900-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v2f32_clamp_precvt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mad_f32 v3, v3, v4, v5 clamp +; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; VI-NEXT: v_cvt_f16_f32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v2f32_clamp_precvt: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; 
CI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2.ext = fpext <2 x half> %src2 to <2 x float> @@ -292,24 +890,85 @@ ret <2 x half> %cvt.result } -; FIXME: Handling undef 4th component -; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_precvt: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp - -; GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp - +; FIXME (DAG): Handling undef 4th component -; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v2 -; GFX9-NEXT: s_setpc_b64 define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 { +; GFX906-LABEL: v_mad_mix_v3f32_clamp_precvt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX906-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX900-NEXT: v_pack_b32_f16 v0, v0, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v3f32_clamp_precvt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; VI-NEXT: v_mad_f32 v6, v6, v7, v8 clamp +; VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; VI-NEXT: v_cvt_f16_f32_sdwa v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v3f32_clamp_precvt: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: 
v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_mad_f32 v1, v1, v4, v7 clamp +; CI-NEXT: v_mad_f32 v2, v2, v5, v8 clamp +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v3, v6 clamp +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <3 x half> %src0 to <3 x float> %src1.ext = fpext <3 x half> %src1 to <3 x float> %src2.ext = fpext <3 x half> %src2 to <3 x float> @@ -320,23 +979,104 @@ ret <3 x half> %cvt.result } -; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_precvt: -; GFX900: v_mad_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX900: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp - - -; GFX906: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX906: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp - -; GFX9: v_cvt_f16_f32 -; GFX9: v_cvt_f16_f32 -; GFX9: v_cvt_f16_f32 -; GFX9: v_cvt_f16_f32 define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { +; GFX906-LABEL: v_mad_mix_v4f32_clamp_precvt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: 
v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX906-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX906-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX906-NEXT: v_pack_b32_f16 v0, v0, v3 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX900-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX900-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX900-NEXT: v_pack_b32_f16 v0, v0, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v4f32_clamp_precvt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; VI-NEXT: v_cvt_f32_f16_sdwa v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; VI-NEXT: v_mad_f32 v6, v6, v8, v10 clamp +; VI-NEXT: v_mad_f32 
v7, v7, v9, v11 clamp +; VI-NEXT: v_mad_f32 v0, v0, v2, v4 clamp +; VI-NEXT: v_mad_f32 v1, v1, v3, v5 clamp +; VI-NEXT: v_cvt_f16_f32_sdwa v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_sdwa v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v4f32_clamp_precvt: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_mad_f32 v1, v1, v5, v9 clamp +; CI-NEXT: v_mad_f32 v2, v2, v6, v10 clamp +; CI-NEXT: v_mad_f32 v3, v3, v7, v11 clamp +; CI-NEXT: v_mad_f32 v0, v0, v4, v8 clamp +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext 
<4 x half> %src0 to <4 x float> %src1.ext = fpext <4 x half> %src1 to <4 x float> %src2.ext = fpext <4 x half> %src2 to <4 x float> diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -1,14 +1,36 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX906,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI %s -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s - -; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo: -; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x00,0x03,0x0a,0x1c] -; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding: [0x00,0x40,0xa0,0xd3,0x00,0x03,0x0a,0x1c] -; VI: v_mac_f32 -; CI: v_mad_f32 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906 %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=CIVI,VI %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CIVI,CI %s + define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; VI-NEXT: v_mac_f32_e32 v0, v3, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -16,11 +38,39 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_f16hi_f16hi_f16hi_int: -; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding -; CIVI: v_mac_f32 define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: 
v_cvt_f32_f16_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_mac_f32_e32 v0, v3, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; CI-NEXT: v_mac_f32_e32 v0, v3, v1 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.hi = lshr i32 %src0, 16 %src1.hi = lshr i32 %src1, 16 %src2.hi = lshr i32 %src2, 16 @@ -37,12 +87,33 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_f16hi_f16hi_f16hi_elt: -; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding -; VI: v_mac_f32 -; CI: v_mad_f32 define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_mac_f32_e32 v0, v3, v1 +; 
VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, v1, v3, v5 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.hi = extractelement <2 x half> %src0, i32 1 %src1.hi = extractelement <2 x half> %src1, i32 1 %src2.hi = extractelement <2 x half> %src2, i32 1 @@ -53,17 +124,55 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_v2f32: -; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mov_b32_e32 v1, v3 - -; GFX906: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX906-NEXT: v_mov_b32_e32 v1, v3 - -; CIVI: v_mac_f32 define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX900-LABEL: v_mad_mix_v2f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_v2f32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: v_mov_b32_e32 v1, v3 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; VI-NEXT: v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v6, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; VI-NEXT: v_mac_f32_e32 v1, v3, v5 +; VI-NEXT: v_mac_f32_e32 v0, v4, v6 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v2f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v6 +; CI-NEXT: v_mac_f32_e32 v3, v1, v5 +; CI-NEXT: v_mov_b32_e32 v1, v3 +; CI-NEXT: v_mac_f32_e32 v0, v4, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2.ext = fpext <2 x half> %src2 to <2 x float> @@ -71,20 +180,52 @@ ret <2 x float> %result } -; GCN-LABEL: {{^}}v_mad_mix_v2f32_shuffle: -; GCN: s_waitcnt -; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mov_b32_e32 v0, v3 -; GFX900-NEXT: s_setpc_b64 - -; GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_mov_b32_e32 v0, v3 -; GFX906-NEXT: s_setpc_b64 - -; CIVI: v_mac_f32 define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX900-LABEL: v_mad_mix_v2f32_shuffle: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_v2f32_shuffle: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_mov_b32_e32 v0, v3 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v2f32_shuffle: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_mad_f32 v0, v3, v0, v2 +; VI-NEXT: v_mac_f32_e32 v2, v4, v1 +; VI-NEXT: v_mov_b32_e32 v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v2f32_shuffle: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; CI-NEXT: v_mad_f32 v0, v4, v2, v1 +; CI-NEXT: v_mac_f32_e32 v1, v5, v3 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.shuf = shufflevector <2 x half> %src0, <2 x half> undef, <2 x i32> %src1.shuf = shufflevector <2 x half> %src1, <2 x half> undef, <2 x i32> %src2.shuf = shufflevector <2 x half> %src2, <2 x half> undef, <2 x i32> @@ -95,17 +236,33 @@ ret <2 x float> %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_negf16lo_f16lo_f16lo: -; GFX900: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding -; GFX900-NEXT: s_setpc_b64 - 
-; GFX906: s_waitcnt -; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding -; GFX906-NEXT: s_setpc_b64 - -; CIVI: v_mad_f32 define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -114,12 +271,33 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_absf16lo_f16lo_f16lo: -; GFX900: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] -; GFX906: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] - -; CIVI: v_mad_f32 define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mad_f32 v0, |v0|, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, |v0|, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -128,17 +306,33 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_negabsf16lo_f16lo_f16lo: -; GFX900: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1] -; GFX900-NEXT: s_setpc_b64 - -; GFX906: s_waitcnt -; GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1] -; GFX906-NEXT: s_setpc_b64 - -; CIVI: v_mad_f32 define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mad_f32 v0, -|v0|, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo: +; CI: ; 
%bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, -|v0|, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -148,28 +342,64 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding -; GFX9-NEXT: s_setpc_b64 - -; CIVI: v_mad_f32 define float @v_mad_mix_f32_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mad_f32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_negf32: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] ; encoding -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] ; encoding -; GFX9-NEXT: s_setpc_b64 - -; CIVI: v_mad_f32 define float 
@v_mad_mix_f32_f16lo_f16lo_negf32(half %src0, half %src1, float %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mad_f32 v0, v0, v1, -v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, v0, v1, -v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.neg = fneg float %src2 @@ -177,14 +407,32 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_absf32: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0] ; encoding -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0] ; encoding -; GFX9-NEXT: s_setpc_b64 - -; CIVI: v_mad_f32 define float @v_mad_mix_f32_f16lo_f16lo_absf32(half %src0, half %src1, float %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
v_mad_mix_f32_f16lo_f16lo_absf32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mad_f32 v0, v0, v1, |v2| +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, v0, v1, |v2| +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.abs = call float @llvm.fabs.f32(float %src2) @@ -192,14 +440,32 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_negabsf32: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0] ; encoding -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0] ; encoding -; GFX9-NEXT: s_setpc_b64 - -; CIVI: v_mad_f32 define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mad_f32 v0, v0, v1, -|v2| +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, v0, v1, -|v2| +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.abs = call 
float @llvm.fabs.f32(float %src2) @@ -212,28 +478,68 @@ ; f16 inline immediate that may be converted to f32, not an actual f32 ; inline immediate. -; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32imm1: -; GCN: s_waitcnt -; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 1.0 -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding - -; CIVI: v_mad_f32 v0, v0, v1, 1.0 -; GCN-NEXT: s_setpc_b64 define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 1.0 +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_mov_b32 s4, 1.0 +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mad_f32 v0, v0, v1, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, v0, v1, 1.0 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 1.0) ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: -; GCN: s_waitcnt -; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0.15915494 -; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding 
-; VI: v_mad_f32 v0, v0, v1, 0.15915494 define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0.15915494 +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_mov_b32 s4, 0.15915494 +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mad_f32 v0, v0, v1, 0.15915494 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e22f983 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 0x3FC45F3060000000) @@ -245,13 +551,35 @@ ; imm value converted to f32. 
; fpext f16 1/2pi = 0x3e230000 ; f32 1/2pi = 0x3e22f983 -; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: -; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0x3e230000 -; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding -; CIVI: v_madak_f32 v0, v0, v1, 0x3e230000 define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1) #0 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x3e230000 +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_mov_b32 s4, 0x3e230000 +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2 = fpext half 0xH3118 to float @@ -259,13 +587,35 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: -; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0x367c0000 -; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding -; CIVI: v_madak_f32 v0, v0, v1, 0x367c0000 define float 
@v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x367c0000 +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_mov_b32 s4, 0x367c0000 +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2 = fpext half 0xH003F to float @@ -273,33 +623,102 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imm1: -; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 1.0 -; GFX900: v_mad_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding -; GFX900: v_mov_b32_e32 v1, v2 - -; GFX906: v_fma_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_mov_b32_e32 v1, v2 define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 { +; GFX900-LABEL: v_mad_mix_v2f32_f32imm1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 1.0 +; GFX900-NEXT: v_mad_mix_f32 
v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_v2f32_f32imm1: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_mov_b32 s4, 1.0 +; GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; GFX906-NEXT: v_mov_b32_e32 v1, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v2f32_f32imm1: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_mad_f32 v0, v0, v3, 1.0 +; VI-NEXT: v_mad_f32 v1, v2, v1, 1.0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v2f32_f32imm1: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mad_f32 v0, v0, v2, 1.0 +; CI-NEXT: v_mad_f32 v1, v1, v3, 1.0 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> ) ret <2 x float> %result } -; GCN-LABEL: {{^}}v_mad_mix_v2f32_cvtf16imminv2pi: -; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0x3e230000 - -; GFX900: v_mad_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX900: v_mad_mix_f32 v0, 
v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding -; GFX900: v_mov_b32_e32 v1, v2 - -; GFX906: v_fma_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_mov_b32_e32 v1, v2 define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { +; GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x3e230000 +; GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_mov_b32 s4, 0x3e230000 +; GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; GFX906-NEXT: v_mov_b32_e32 v1, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_mov_b32_e32 v1, 0x3e230000 +; VI-NEXT: v_madak_f32 v0, v0, v3, 0x3e230000 +; VI-NEXT: v_mac_f32_e32 v1, v2, v4 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 
v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; CI-NEXT: v_mov_b32_e32 v1, 0x3e230000 +; CI-NEXT: v_madak_f32 v0, v0, v2, 0x3e230000 +; CI-NEXT: v_mac_f32_e32 v1, v4, v3 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2 = fpext <2 x half> to <2 x float> @@ -307,17 +726,51 @@ ret <2 x float> %result } -; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imminv2pi: -; GFX9: s_mov_b32 [[SREG:s[0-9]+]], 0.15915494 - -; GFX900: v_mad_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX900: v_mad_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding -; GFX900: v_mov_b32_e32 v1, v2 - -; GFX906: v_fma_mix_f32 v2, v0, v1, [[SREG]] op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v1, [[SREG]] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_mov_b32_e32 v1, v2 define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { +; GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0.15915494 +; GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_mov_b32 s4, 0.15915494 +; GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0] +; GFX906-NEXT: v_mov_b32_e32 v1, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: 
v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_mad_f32 v0, v0, v3, 0.15915494 +; VI-NEXT: v_mad_f32 v1, v2, v1, 0.15915494 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; CI-NEXT: v_mov_b32_e32 v1, 0x3e22f983 +; CI-NEXT: v_madak_f32 v0, v0, v2, 0x3e22f983 +; CI-NEXT: v_mac_f32_e32 v1, v4, v3 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> %src2 = fpext <2 x half> to <2 x float> @@ -325,11 +778,33 @@ ret <2 x float> %result } -; GCN-LABEL: {{^}}v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: -; GFX900: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; encoding -; CIVI: v_mad_f32 v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}}, v{{[0-9]}} clamp{{$}} define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { +; GFX900-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: 
v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, v1, v3, v5 clamp +; CI-NEXT: s_setpc_b64 s[30:31] %src0.hi = extractelement <2 x half> %src0, i32 1 %src1.hi = extractelement <2 x half> %src1, i32 1 %src2.hi = extractelement <2 x half> %src2, i32 1 @@ -342,37 +817,86 @@ ret float %clamp } -; GCN-LABEL: no_mix_simple: -; GCN: s_waitcnt -; GCN-NEXT: v_{{mad|fma}}_f32 v0, v0, v1, v2 -; GCN-NEXT: s_setpc_b64 define float @no_mix_simple(float %src0, float %src1, float %src2) #0 { +; GFX900-LABEL: no_mix_simple: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_f32 v0, v0, v1, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: no_mix_simple: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; CIVI-LABEL: no_mix_simple: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CIVI-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2) ret float %result } -; GCN-LABEL: no_mix_simple_fabs: -; GCN: s_waitcnt -; CIVI-NEXT: v_mad_f32 v0, |v0|, v1, v2 -; GFX900-NEXT: v_mad_f32 v0, |v0|, v1, v2 -; GFX906-NEXT: v_fma_f32 v0, |v0|, v1, v2 -; GCN-NEXT: s_setpc_b64 define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 { +; GFX900-LABEL: 
no_mix_simple_fabs: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_f32 v0, |v0|, v1, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: no_mix_simple_fabs: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_f32 v0, |v0|, v1, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; CIVI-LABEL: no_mix_simple_fabs: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: v_mad_f32 v0, |v0|, v1, v2 +; CIVI-NEXT: s_setpc_b64 s[30:31] %src0.fabs = call float @llvm.fabs.f32(float %src0) %result = call float @llvm.fmuladd.f32(float %src0.fabs, float %src1, float %src2) ret float %result } -; FIXME: Should abe able to select in thits case +; FIXME(DAG): Should be able to select in this case. ; All sources are converted from f16, so it doesn't matter ; v_mad_mix_f32 flushes. -; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: -; GFX900: v_cvt_f32_f16 -; GFX900: v_cvt_f32_f16 -; GFX900: v_cvt_f32_f16 -; GFX900: v_fma_f32 define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %src1, half %src2) #1 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +;
VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_fma_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -380,27 +904,78 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32_denormals: -; GFX900: v_cvt_f32_f16 -; GFX900: v_cvt_f32_f16 -; GFX900: v_fma_f32 - -; GFX906-NOT: v_cvt_f32_f16 -; GFX906: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, float %src2) #1 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_fma_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) ret float %result } -; 
GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: -; GFX9: v_cvt_f32_f16 -; GFX9: v_cvt_f32_f16 -; GFX9: v_cvt_f32_f16 -; GFX9: v_mul_f32 -; GFX9: v_add_f32 define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, half %src2) #1 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX906-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX906-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX906-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX906-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -409,12 +984,40 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: -; GFX9: v_cvt_f32_f16 -; GFX9: v_cvt_f32_f16 -; GFX9: v_mul_f32 -; GFX9: v_add_f32 define 
float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, float %src2) #1 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX906-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX906-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX906-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: v_add_f32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %mul = fmul float %src0.ext, %src1.ext @@ -422,12 +1025,33 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; encoding -; GFX9-NEXT: s_setpc_b64 define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src1, half %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; VI-NEXT: v_mac_f32_e32 v0, v3, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float @@ -436,12 +1060,32 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: -; GCN: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] ; encoding -; GFX9-NEXT: s_setpc_b64 define float @v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src1, float %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mad_f32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float %mul = fmul contract float %src0.ext, %src1.ext @@ -449,32 +1093,83 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: -; GFX9: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding -; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] ; encoding -; GFX9-NEXT: s_setpc_b64 - -; CIVI: v_mad_f32 define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> %src0 = extractelement 
<2 x half> %src0.arg.bc, i32 0 - %src0.neg = fsub half -0.0, %src0 + %src0.neg = fneg half %src0 %src0.ext = fpext half %src0.neg to float %src1.ext = fpext half %src1 to float %src2.ext = fpext half %src2 to float -; %src0.ext.neg = fsub float -0.0, %src0.ext +; %src0.ext.neg = fneg float %src0.ext %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) ret float %result } ; Make sure we don't fold pre-cvt fneg if we already have a fabs -; GCN-LABEL: {{^}}v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: -; GFX900: s_waitcnt + define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x8000 +; GFX900-NEXT: v_xor_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX900-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_mov_b32 s4, 0x8000 +; GFX906-NEXT: v_xor_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mad_f32 v0, |v0|, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: 
v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> %src0 = extractelement <2 x half> %src0.arg.bc, i32 1 - %src0.neg = fsub half -0.0, %src0 + %src0.neg = fneg half %src0 %src0.ext = fpext half %src0.neg to float %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) %src1.ext = fpext half %src1 to float @@ -483,12 +1178,35 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: -; GFX9: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] -; GFX9-NEXT: s_setpc_b64 define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v3, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; VI-NEXT: v_mac_f32_e32 v0, v3, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to 
<2 x half> %src0 = extractelement <2 x half> %src0.arg.bc, i32 1 %src0.abs = call half @llvm.fabs.f16(half %src0) @@ -499,14 +1217,37 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: -; GFX9: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] -; GFX9-NEXT: s_setpc_b64 define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> - %fneg = fsub <2 x half> , %src0.arg.bc + %fneg = fneg <2 x half> %src0.arg.bc %src0 = extractelement <2 x half> %fneg, i32 1 %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float @@ -515,12 +1256,35 @@ ret float %result } -; 
GCN-LABEL: {{^}}v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: -; GFX9: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] -; GFX9-NEXT: s_setpc_b64 define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v3, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; VI-NEXT: v_mac_f32_e32 v0, v3, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %src0.arg.bc) %src0 = extractelement <2 x half> %fabs, i32 1 @@ -531,15 +1295,38 @@ ret float %result } -; GCN-LABEL: {{^}}v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: -; GFX9: s_waitcnt -; GFX900-NEXT: v_mad_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] 
op_sel_hi:[1,1,1] -; GFX9-NEXT: s_setpc_b64 define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 { +; GFX900-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mad_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; CI-NEXT: v_mad_f32 v0, v0, v1, v2 +; CI-NEXT: s_setpc_b64 s[30:31] %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %src0.arg.bc) - %fneg.fabs = fsub <2 x half> , %fabs + %fneg.fabs = fneg <2 x half> %fabs %src0 = extractelement <2 x half> %fneg.fabs, i32 1 %src0.ext = fpext half %src0 to float %src1.ext = fpext half %src1 to float