diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll --- a/llvm/test/CodeGen/AMDGPU/v_mac.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll @@ -1,14 +1,126 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-FLUSH,GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s - -; GCN-LABEL: {{^}}mac_vvv: -; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 glc{{$}} -; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 -; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8 -; GCN: v_mac_f32_e32 [[C]], [[A]], [[B]] -; GCN: buffer_store_dword [[C]] +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s + define amdgpu_kernel void @mac_vvv(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: mac_vvv: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mac_vvv: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-FLUSH-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mac_vvv: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mac_vvv: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: buffer_load_b32 v0, off, s[8:11], 0 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: buffer_load_b32 v1, off, s[8:11], 0 offset:4 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: buffer_load_b32 v2, off, s[8:11], 0 offset:8 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mac_vvv: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v1, off, s[8:11], 0 offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v2, off, s[8:11], 0 offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -23,10 +135,69 @@ ret void } -; GCN-LABEL: {{^}}mad_inline_sgpr_inline: -; GCN-NOT: v_mac_f32 -; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5 define amdgpu_kernel void @mad_inline_sgpr_inline(ptr addrspace(1) %out, float %in) #0 { +; SI-LABEL: mad_inline_sgpr_inline: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mad_f32 v0, s4, 0.5, 0.5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mad_inline_sgpr_inline: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s2, -1 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mad_f32 v0, s4, 0.5, 0.5 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mad_inline_sgpr_inline: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mad_f32 v0, s4, 0.5, 0.5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_inline_sgpr_inline: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_clause 0x1 +; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f32_e64 v0, s2, 0.5 +; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f32_e32 v0, 0.5, v0 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mad_inline_sgpr_inline: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mul_f32_e64 v0, s2, 0.5 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, 0.5, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %tmp0 = fmul float 0.5, %in %tmp1 = fadd float %tmp0, 0.5 @@ -34,10 +205,107 @@ ret void } -; GCN-LABEL: {{^}}mad_vvs: -; GCN-NOT: v_mac_f32 -; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @mad_vvs(ptr addrspace(1) %out, ptr addrspace(1) %in, float %c) #0 { +; SI-LABEL: mad_vvs: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mad_f32 v0, v0, v1, s12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mad_vvs: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dword s12, s[0:1], 0x34 +; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s2, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s2 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s6 +; VI-FLUSH-NEXT: s_mov_b32 s9, s7 +; VI-FLUSH-NEXT: s_mov_b32 s11, s3 +; VI-FLUSH-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s0, s4 +; VI-FLUSH-NEXT: s_mov_b32 s1, s5 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mad_f32 v0, v0, v1, s12 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mad_vvs: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s12, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f32 v0, v0, v1, s12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_vvs: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2 +; GFX11-FLUSH-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s7 +; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5 +; GFX11-FLUSH-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mad_vvs: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s11, s3 +; GFX11-NEXT: s_mov_b32 s10, s2 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 @@ -50,9 +318,107 @@ ret void } -; GCN-LABEL: {{^}}mac_ssv: -; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @mac_ssv(ptr addrspace(1) %out, ptr addrspace(1) %in, float %a) #0 { +; SI-LABEL: mac_ssv: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mac_f32_e64 v0, s12, s12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mac_ssv: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dword s12, s[0:1], 0x34 +; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s2, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s2 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s6 +; VI-FLUSH-NEXT: s_mov_b32 s9, s7 +; VI-FLUSH-NEXT: s_mov_b32 s11, s3 +; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s0, s4 +; VI-FLUSH-NEXT: s_mov_b32 s1, s5 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f32_e64 v0, s12, s12 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mac_ssv: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s12, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f32_e64 v0, s12, s12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mac_ssv: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2 +; GFX11-FLUSH-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s7 +; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5 +; GFX11-FLUSH-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FLUSH-NEXT: v_mul_f32_e64 v1, s0, s0 +; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mac_ssv: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s11, s3 +; GFX11-NEXT: s_mov_b32 s10, s2 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: v_mul_f32_e64 v1, s0, s0 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %c = load float, ptr addrspace(1) %in @@ -62,10 +428,147 @@ ret void } -; GCN-LABEL: {{^}}mac_mad_same_add: -; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] -; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mac_mad_same_add(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: mac_mad_same_add: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mad_f32 v0, v0, v2, v1 +; SI-NEXT: v_mac_f32_e32 v1, v3, v4 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mac_mad_same_add: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:16 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: v_mad_f32 v0, v0, v2, v1 +; VI-FLUSH-NEXT: v_mac_f32_e32 v1, v3, v4 +; VI-FLUSH-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mac_mad_same_add: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:16 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mad_f32 v0, v0, v2, v1 +; VI-NEXT: v_mac_f32_e32 v1, v3, v4 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mac_mad_same_add: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: buffer_load_b32 v0, off, s[8:11], 0 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: buffer_load_b32 v1, off, s[8:11], 0 offset:4 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: buffer_load_b32 v2, off, s[8:11], 0 offset:8 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: buffer_load_b32 v3, off, s[8:11], 0 offset:12 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: buffer_load_b32 v4, off, s[8:11], 0 offset:16 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: v_dual_mul_f32 v0, v0, v1 :: v_dual_mul_f32 v1, v3, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-FLUSH-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mac_mad_same_add: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v1, off, s[8:11], 0 offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v2, off, s[8:11], 0 offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v3, off, s[8:11], 0 offset:12 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v4, off, s[8:11], 0 offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v1 :: v_dual_mul_f32 v1, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -92,11 +595,103 @@ ; There is no advantage to using v_mac when one of the operands is negated ; and v_mad accepts more operand types. - -; GCN-LABEL: {{^}}mad_neg_src0: -; GCN-NOT: v_mac_f32 -; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} define amdgpu_kernel void @mad_neg_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: mad_neg_src0: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mad_neg_src0: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mad_neg_src0: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_neg_src0: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mad_neg_src0: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -113,10 +708,103 @@ ret void } -; GCN-LABEL: {{^}}nsz_mad_sub0_src0: -; GCN-NOT: v_mac_f32 -; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} define amdgpu_kernel void @nsz_mad_sub0_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; SI-LABEL: nsz_mad_sub0_src0: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: nsz_mad_sub0_src0: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: nsz_mad_sub0_src0: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: nsz_mad_sub0_src0: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: nsz_mad_sub0_src0: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -133,10 +821,109 @@ ret void } -; GCN-LABEL: {{^}}safe_mad_sub0_src0: -; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0, -; GCN: v_ma{{[cd]}}_f32{{[_e32]*}} v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}} define amdgpu_kernel void @safe_mad_sub0_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: safe_mad_sub0_src0: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_sub_f32_e32 v0, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: safe_mad_sub0_src0: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_sub_f32_e32 v0, 0, v0 +; VI-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-FLUSH-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: safe_mad_sub0_src0: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_f32_e32 v0, 0, v0 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: safe_mad_sub0_src0: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_sub_f32_e32 v0, 0, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: safe_mad_sub0_src0: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_f32_e32 v0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -153,10 +940,103 @@ ret void } -; GCN-LABEL: {{^}}mad_neg_src1: -; GCN-NOT: v_mac_f32 -; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} define amdgpu_kernel void @mad_neg_src1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: mad_neg_src1: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mad_neg_src1: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mad_neg_src1: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_neg_src1: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mad_neg_src1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -173,10 +1053,103 @@ ret void } -; GCN-LABEL: {{^}}nsz_mad_sub0_src1: -; GCN-NOT: v_mac_f32 -; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} define amdgpu_kernel void @nsz_mad_sub0_src1(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; SI-LABEL: nsz_mad_sub0_src1: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: nsz_mad_sub0_src1: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: nsz_mad_sub0_src1: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: nsz_mad_sub0_src1: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: nsz_mad_sub0_src1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -193,10 +1166,103 @@ ret void } -; GCN-LABEL: {{^}}mad_neg_src2: -; GCN-NOT: v_mac -; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} define amdgpu_kernel void @mad_neg_src2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: mad_neg_src2: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mad_f32 v0, v0, v1, -v2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mad_neg_src2: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mad_f32 v0, v0, v1, -v2 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mad_neg_src2: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f32 v0, v0, v1, -v2 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_neg_src2: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mad_neg_src2: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -215,14 +1281,137 @@ ; Without special casing the inline constant check for v_mac_f32's ; src2, this fails to fold the 1.0 into a mad. - -; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f32: -; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] - -; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] -; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) #3 { +; SI-LABEL: fold_inline_imm_into_mac_src2_f32: +; SI: ; %bb.0: ; %bb +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_add_f32_e32 v4, v2, v2 +; SI-NEXT: v_add_f32_e32 v5, v3, v3 +; SI-NEXT: v_mad_f32 v4, v4, -4.0, 1.0 +; SI-NEXT: v_add_f32_e32 v3, v4, v3 +; SI-NEXT: v_mad_f32 v2, -v5, v2, 1.0 +; SI-NEXT: v_mac_f32_e32 v3, 0x41000000, v2 +; SI-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: fold_inline_imm_into_mac_src2_f32: +; VI-FLUSH: ; %bb.0: ; %bb +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-FLUSH-NEXT: flat_load_dword v5, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_dword v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_f32_e32 v3, v5, v5 +; VI-FLUSH-NEXT: v_add_f32_e32 v4, v2, v2 +; VI-FLUSH-NEXT: v_mad_f32 v3, v3, -4.0, 1.0 +; VI-FLUSH-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-FLUSH-NEXT: v_mad_f32 v3, -v4, v5, 1.0 +; VI-FLUSH-NEXT: v_mac_f32_e32 v2, 0x41000000, v3 +; VI-FLUSH-NEXT: flat_store_dword v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: fold_inline_imm_into_mac_src2_f32: +; VI: ; %bb.0: ; %bb +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_f32_e32 v3, v5, v5 +; VI-NEXT: v_add_f32_e32 v4, v2, v2 +; VI-NEXT: v_mad_f32 v3, v3, -4.0, 1.0 +; VI-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-NEXT: v_mad_f32 v3, -v4, v5, 1.0 +; VI-NEXT: v_mac_f32_e32 v2, 0x41000000, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fold_inline_imm_into_mac_src2_f32: +; GFX11-FLUSH: ; %bb.0: ; %bb +; GFX11-FLUSH-NEXT: s_clause 0x1 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_dual_add_f32 v3, v1, v1 :: v_dual_add_f32 v4, v2, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v3, -4.0, v3 +; GFX11-FLUSH-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_dual_mul_f32 v1, v4, v1 :: v_dual_add_f32 v2, v3, v2 +; GFX11-FLUSH-NEXT: v_sub_f32_e32 v1, 1.0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v1, 0x41000000, v1 +; GFX11-FLUSH-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX11-FLUSH-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: fold_inline_imm_into_mac_src2_f32: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v3, v1, v1 :: v_dual_add_f32 v4, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v3, -4.0, v3 +; GFX11-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v1, v4, v1 :: v_dual_add_f32 v2, v3, v2 +; GFX11-NEXT: v_sub_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, 0x41000000, v1 +; GFX11-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -244,20 +1433,149 @@ ret void } -; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f16: -; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[B:v[0-9]+]] - -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[A]] -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[B]] - -; SI: v_add_f32_e32 [[TMP2:v[0-9]+]], [[CVT_A]], [[CVT_A]] -; SI: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 -; SI: v_mac_f32_e32 v{{[0-9]+}}, 0x41000000, v{{[0-9]+}} - -; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] -; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f16(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) #3 { +; SI-LABEL: fold_inline_imm_into_mac_src2_f16: +; SI: ; %bb.0: ; %bb +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, v2, v2 +; SI-NEXT: v_add_f32_e32 v5, v3, v3 +; SI-NEXT: v_mad_f32 v4, v4, -4.0, 1.0 +; SI-NEXT: v_add_f32_e32 v3, v4, v3 +; SI-NEXT: v_mad_f32 v2, -v5, v2, 1.0 +; SI-NEXT: v_mac_f32_e32 v3, 0x41000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: fold_inline_imm_into_mac_src2_f16: +; VI-FLUSH: ; %bb.0: ; %bb +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-FLUSH-NEXT: flat_load_ushort v5, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_f16_e32 v3, v5, v5 +; VI-FLUSH-NEXT: v_add_f16_e32 v4, v2, v2 +; VI-FLUSH-NEXT: v_mad_f16 v3, v3, -4.0, 1.0 +; VI-FLUSH-NEXT: v_add_f16_e32 v2, v3, v2 +; VI-FLUSH-NEXT: v_mad_f16 v3, -v4, v5, 1.0 +; VI-FLUSH-NEXT: v_mac_f16_e32 v2, 0x4800, v3 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: fold_inline_imm_into_mac_src2_f16: +; VI: ; %bb.0: ; %bb +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_ushort v5, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_f16_e32 v3, v5, v5 +; VI-NEXT: v_add_f16_e32 v4, v2, v2 +; VI-NEXT: v_mul_f16_e32 v3, -4.0, v3 +; VI-NEXT: v_mul_f16_e32 v4, v4, v5 +; VI-NEXT: v_add_f16_e32 v3, 1.0, v3 +; VI-NEXT: v_sub_f16_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f16_e32 v2, v3, v2 +; VI-NEXT: v_mul_f16_e32 v3, 0x4800, v4 +; VI-NEXT: v_add_f16_e32 v2, v2, v3 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fold_inline_imm_into_mac_src2_f16: +; GFX11-FLUSH: ; %bb.0: ; %bb +; GFX11-FLUSH-NEXT: s_clause 0x1 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v3, v1, v1 +; GFX11-FLUSH-NEXT: v_add_f16_e32 v4, v2, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v3, -4.0, v3 +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v4, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v3, 1.0, v3 +; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, 1.0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, v3, v2 +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, 0x4800, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: fold_inline_imm_into_mac_src2_f16: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v3, v1, v1 +; GFX11-NEXT: v_add_f16_e32 v4, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f16_e32 v3, -4.0, v3 +; GFX11-NEXT: v_mul_f16_e32 v1, v4, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f16_e32 v3, 1.0, v3 +; GFX11-NEXT: v_sub_f16_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f16_e32 v2, v3, v2 +; GFX11-NEXT: v_mul_f16_e32 v1, 0x4800, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v1, v2, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll --- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -1,20 +1,102 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s - -; GCN-LABEL: {{^}}mac_f16: -; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort v[[C_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] -; SI: buffer_store_short v[[R_F16]] -; VI: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]] -; VI: buffer_store_short v[[C_F16]] -; GCN: s_endpgm +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX11 %s + define amdgpu_kernel void @mac_f16( +; SI-LABEL: mac_f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_e32 v2, v0, v1 +; VI-NEXT: buffer_store_short v2, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -31,14 +113,153 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_same_add: -; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] -; SI: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} - -; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] -; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_same_add( +; SI-LABEL: mac_f16_same_add: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x15 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s20, s8 +; SI-NEXT: s_mov_b32 s21, s9 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s24, s12 +; SI-NEXT: s_mov_b32 s25, s13 +; SI-NEXT: s_mov_b32 s26, s2 +; SI-NEXT: s_mov_b32 s27, s3 +; SI-NEXT: s_mov_b32 s12, s14 +; SI-NEXT: s_mov_b32 s13, s15 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 +; SI-NEXT: buffer_load_ushort v3, off, s[24:27], 0 +; SI-NEXT: buffer_load_ushort v4, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_mad_f32 v1, v1, v2, v3 +; SI-NEXT: v_mac_f32_e32 v3, v4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v1, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_same_add: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x54 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s20, s8 +; VI-NEXT: s_mov_b32 s21, s9 +; VI-NEXT: s_mov_b32 s23, s3 +; VI-NEXT: s_mov_b32 s8, s10 +; VI-NEXT: s_mov_b32 s9, s11 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s24, s12 +; VI-NEXT: s_mov_b32 s25, s13 +; VI-NEXT: s_mov_b32 s26, s2 +; VI-NEXT: s_mov_b32 s27, s3 +; VI-NEXT: s_mov_b32 s12, s14 +; VI-NEXT: s_mov_b32 s13, s15 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s18, s2 +; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[20:23], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[24:27], 0 +; VI-NEXT: buffer_load_ushort v3, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v4, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_mad_f16 v0, v0, v1, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_e32 v2, v4, v3 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v2, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_same_add: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[16:17], s[0:1], 0x54 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s2 +; GFX11-NEXT: s_mov_b32 s19, s3 +; GFX11-NEXT: s_mov_b32 s22, s2 +; GFX11-NEXT: s_mov_b32 s23, s3 +; GFX11-NEXT: s_mov_b32 s26, s2 +; GFX11-NEXT: s_mov_b32 s27, s3 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s30, s2 +; GFX11-NEXT: s_mov_b32 s31, s3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s20, s8 +; GFX11-NEXT: s_mov_b32 s21, s9 +; GFX11-NEXT: s_mov_b32 s24, s10 +; GFX11-NEXT: s_mov_b32 s25, s11 +; GFX11-NEXT: s_mov_b32 s36, s14 +; GFX11-NEXT: s_mov_b32 s37, s15 +; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[24:27], 0 +; GFX11-NEXT: buffer_load_u16 v3, off, s[36:39], 0 +; GFX11-NEXT: s_mov_b32 s28, s12 +; GFX11-NEXT: s_mov_b32 s29, s13 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: buffer_load_u16 v4, off, s[28:31], 0 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_mov_b32 s10, s2 +; GFX11-NEXT: s_mov_b32 s11, s3 +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v3, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f16_e32 v1, v1, v4 +; GFX11-NEXT: v_add_f16_e32 v0, v0, v4 +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r0, ptr addrspace(1) %r1, ptr addrspace(1) %a, @@ -64,16 +285,100 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_a: -; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_a( +; SI-LABEL: mac_f16_neg_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f16 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -91,16 +396,100 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_b: -; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_b( +; SI-LABEL: mac_f16_neg_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f16 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -118,16 +507,100 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_c: -; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_c( +; SI-LABEL: mac_f16_neg_c: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, v0, v1, -v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_c: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f16 v0, v0, v1, -v2 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_c: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -145,13 +618,105 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math: -; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} -; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math( +; SI-LABEL: mac_f16_neg_a_safe_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_sub_f32_e32 v0, 0, v0 +; SI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_a_safe_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_sub_f16_e32 v0, 0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_e32 v2, v0, v1 +; VI-NEXT: buffer_store_short v2, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_a_safe_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_sub_f16_e32 v0, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -169,13 +734,105 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math: -; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] -; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math( +; SI-LABEL: mac_f16_neg_b_safe_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_sub_f32_e32 v1, 0, v1 +; SI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_b_safe_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_sub_f16_e32 v1, 0, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_e32 v2, v0, v1 +; VI-NEXT: buffer_store_short v2, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_b_safe_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s4 +; GFX11-NEXT: s_mov_b32 s13, s5 +; GFX11-NEXT: s_mov_b32 s16, s2 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s17, s3 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_sub_f16_e32 v0, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -193,13 +850,104 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_c_safe_fp_math: -; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_c_safe_fp_math( +; SI-LABEL: mac_f16_neg_c_safe_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_sub_f32_e32 v1, 0, v1 +; SI-NEXT: v_mac_f32_e32 v1, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_c_safe_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[16:19], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_sub_f16_e32 v1, 0, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_e32 v1, v0, v2 +; VI-NEXT: buffer_store_short v1, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_c_safe_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_sub_f16_e32 v1, 0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -217,16 +965,100 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_a_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_a_nsz_fp_math( +; SI-LABEL: mac_f16_neg_a_nsz_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_a_nsz_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f16 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_a_nsz_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -244,16 +1076,100 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_b_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_b_nsz_fp_math( +; SI-LABEL: mac_f16_neg_b_nsz_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_b_nsz_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f16 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_b_nsz_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -271,16 +1187,100 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_c_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, [[CVT_A]], [[CVT_B]], -[[CVT_C]] - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_c_nsz_fp_math( +; SI-LABEL: mac_f16_neg_c_nsz_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, v0, v1, -v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_c_nsz_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f16 v0, v0, v1, -v2 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_c_nsz_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -298,41 +1298,123 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16: -; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]] - -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] - -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] - -; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] - -; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] -; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]] -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; VI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] - -; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]] -; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]] -; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]] - -; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16( +; SI-LABEL: mac_v2f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_barrier +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_barrier +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mac_f32_e32 v5, v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_mac_f32_e32 v2, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_barrier +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_barrier +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_mac_f16_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_mac_f16_e32 v2, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v2, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_barrier +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_barrier +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -351,19 +1433,179 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_same_add: -; SI-DAG: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; VI-DAG: v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_same_add( +; SI-LABEL: mac_v2f16_same_add: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x15 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s20, s8 +; SI-NEXT: s_mov_b32 s21, s9 +; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s24, s12 +; SI-NEXT: s_mov_b32 s25, s13 +; SI-NEXT: s_mov_b32 s26, s2 +; SI-NEXT: s_mov_b32 s27, s3 +; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s12, s14 +; SI-NEXT: buffer_load_dword v1, off, s[20:23], 0 +; SI-NEXT: s_mov_b32 s13, s15 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: buffer_load_dword v3, off, s[24:27], 0 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v4, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_mad_f32 v6, v6, v7, v8 +; SI-NEXT: v_mad_f32 v1, v1, v2, v3 +; SI-NEXT: v_mac_f32_e32 v8, v9, v5 +; SI-NEXT: v_mac_f32_e32 v3, v4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_same_add: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x54 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s20, s8 +; VI-NEXT: s_mov_b32 s21, s9 +; VI-NEXT: s_mov_b32 s23, s3 +; VI-NEXT: s_mov_b32 s8, s10 +; VI-NEXT: s_mov_b32 s9, s11 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s24, s12 +; VI-NEXT: s_mov_b32 s25, s13 +; VI-NEXT: s_mov_b32 s26, s2 +; VI-NEXT: s_mov_b32 s27, s3 +; VI-NEXT: s_mov_b32 s12, s14 +; VI-NEXT: s_mov_b32 s13, s15 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s18, s2 +; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: buffer_load_dword v0, off, s[20:23], 0 +; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v2, off, s[24:27], 0 +; VI-NEXT: buffer_load_dword v3, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v4, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mac_f16_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mad_f16 v6, v0, v1, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; VI-NEXT: v_mac_f16_e32 v2, v4, v3 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_same_add: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[16:17], s[0:1], 0x54 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s2 +; GFX11-NEXT: s_mov_b32 s19, s3 +; GFX11-NEXT: s_mov_b32 s22, s2 +; GFX11-NEXT: s_mov_b32 s23, s3 +; GFX11-NEXT: s_mov_b32 s26, s2 +; GFX11-NEXT: s_mov_b32 s27, s3 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s30, s2 +; GFX11-NEXT: s_mov_b32 s31, s3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s20, s8 +; GFX11-NEXT: s_mov_b32 s21, s9 +; GFX11-NEXT: s_mov_b32 s24, s10 +; GFX11-NEXT: s_mov_b32 s25, s11 +; GFX11-NEXT: s_mov_b32 s36, s14 +; GFX11-NEXT: s_mov_b32 s37, s15 +; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[24:27], 0 +; GFX11-NEXT: buffer_load_b32 v3, off, s[36:39], 0 +; GFX11-NEXT: s_mov_b32 s28, s12 +; GFX11-NEXT: s_mov_b32 s29, s13 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: buffer_load_b32 v4, off, s[28:31], 0 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_mov_b32 s10, s2 +; GFX11-NEXT: s_mov_b32 s11, s3 +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_pk_mul_f16 v1, v1, v2 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_mul_f16 v0, v3, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_add_f16 v1, v1, v4 +; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r0, ptr addrspace(1) %r1, ptr addrspace(1) %a, @@ -389,18 +1631,118 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_a: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} - -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_a( +; SI-LABEL: mac_v2f16_neg_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mad_f32 v1, -v3, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_mad_f16 v3, -v4, v3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_mad_f16 v0, -v1, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -418,18 +1760,118 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_b -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_b( +; SI-LABEL: mac_v2f16_neg_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mad_f32 v1, -v3, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_mad_f16 v3, -v4, v3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_mad_f16 v0, -v1, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -447,22 +1889,118 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_c: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} - -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_c( +; SI-LABEL: mac_v2f16_neg_c: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, v0, v4, -v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mad_f32 v1, v3, v1, -v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_c: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_mad_f16 v3, v4, v3, -v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_mad_f16 v0, v1, v0, -v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_c: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -480,21 +2018,123 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math: - -; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} - -; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} - -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math( +; SI-LABEL: mac_v2f16_neg_a_safe_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_sub_f32_e32 v3, 0, v3 +; SI-NEXT: v_sub_f32_e32 v0, 0, v0 +; SI-NEXT: v_mac_f32_e32 v5, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_mac_f32_e32 v2, v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_a_safe_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v2, off, s[16:19], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_sub_f16_e32 v4, 0, v0 +; VI-NEXT: v_sub_f16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; VI-NEXT: v_mac_f16_e32 v1, v4, v2 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_a_safe_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_pk_add_f16 v0, v0, 0 neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -512,21 +2152,123 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math: - -; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] - -; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] - -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math( +; SI-LABEL: mac_v2f16_neg_b_safe_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_sub_f32_e32 v4, 0, v4 +; SI-NEXT: v_sub_f32_e32 v1, 0, v1 +; SI-NEXT: v_mac_f32_e32 v5, v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_mac_f32_e32 v2, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_b_safe_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_sub_f16_e32 v4, 0, v0 +; VI-NEXT: v_sub_f16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_sdwa v3, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; VI-NEXT: v_mac_f16_e32 v1, v2, v4 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_b_safe_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s4 +; GFX11-NEXT: s_mov_b32 s13, s5 +; GFX11-NEXT: s_mov_b32 s16, s2 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s17, s3 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_pk_add_f16 v0, v0, 0 neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_f16 v0, v1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -544,21 +2286,119 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math: - -; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} - -; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_mac_f16_sdwa v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} - -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math( +; SI-LABEL: mac_v2f16_neg_c_safe_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_sub_f32_e32 v5, 0, v5 +; SI-NEXT: v_sub_f32_e32 v2, 0, v2 +; SI-NEXT: v_mac_f32_e32 v5, v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_mac_f32_e32 v2, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_c_safe_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[16:19], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_sub_f16_e32 v4, 0, v0 +; VI-NEXT: v_sub_f16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_mac_f16_e32 v4, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_c_safe_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_add_f16 v1, v1, 0 neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -576,22 +2416,118 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_a_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} - -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_a_nsz_fp_math( +; SI-LABEL: mac_v2f16_neg_a_nsz_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mad_f32 v1, -v3, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_a_nsz_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_mad_f16 v3, -v4, v3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_mad_f16 v0, -v1, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_a_nsz_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -609,22 +2545,118 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_b_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} - -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_b_nsz_fp_math( +; SI-LABEL: mac_v2f16_neg_b_nsz_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mad_f32 v1, -v3, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_b_nsz_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_mad_f16 v3, -v4, v3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_mad_f16 v0, -v1, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_b_nsz_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -642,22 +2674,118 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_c_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} - -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} -; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_c_nsz_fp_math( +; SI-LABEL: mac_v2f16_neg_c_nsz_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, v0, v4, -v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mad_f32 v1, v3, v1, -v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_c_nsz_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_mad_f16 v3, v4, v3, -v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_mad_f16 v0, v1, v0, -v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_c_nsz_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b,