diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -1,44 +1,148 @@ -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,CI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,GFX89,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,GFX89,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s -; GCN-LABEL: {{^}}void_func_i1: -; GCN: v_and_b32_e32 v0, 1, v0 -; GCN: buffer_store_byte v0, off define void @void_func_i1(i1 %arg0) #0 { +; CIGFX89-LABEL: void_func_i1: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store i1 %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i1_zeroext: -; GCN: s_waitcnt -; GCN-NEXT: v_or_b32_e32 v0, 12, v0 -; GCN-NOT: v0 -; GCN: buffer_store_dword v0, off define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 { +; CIGFX89-LABEL: void_func_i1_zeroext: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: v_or_b32_e32 v0, 12, v0 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i1_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_or_b32_e32 v0, 12, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ext = zext i1 %arg0 to i32 %add = add i32 %ext, 12 store i32 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i1_signext: -; GCN: s_waitcnt -; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 -; GCN-NOT: v0 -; GCN: buffer_store_dword v0, off define void @void_func_i1_signext(i1 signext %arg0) #0 { +; CI-LABEL: void_func_i1_signext: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, 12, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_i1_signext: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 12, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_i1_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 12, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i1_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 12, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ext = sext i1 %arg0 to i32 %add = add i32 %ext, 12 store i32 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}i1_arg_i1_use: -; GCN: v_and_b32_e32 v0, 1, v0 -; GCN: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1 define void @i1_arg_i1_use(i1 %arg) #0 { +; CIGFX89-LABEL: i1_arg_i1_use: +; CIGFX89: ; %bb.0: ; %bb +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0 +; CIGFX89-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CIGFX89-NEXT: s_xor_b64 s[6:7], vcc, -1 +; CIGFX89-NEXT: s_and_saveexec_b64 s[4:5], s[6:7] +; CIGFX89-NEXT: s_cbranch_execz .LBB3_2 +; CIGFX89-NEXT: ; %bb.1: ; %bb1 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: v_mov_b32_e32 v0, 0 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: .LBB3_2: ; %bb2 +; CIGFX89-NEXT: s_or_b64 exec, exec, s[4:5] +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i1_arg_i1_use: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: s_xor_b32 s1, vcc_lo, -1 +; GFX11-NEXT: s_and_saveexec_b32 s0, s1 +; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: ; %bb.1: ; %bb1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: .LBB3_2: ; %bb2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: br i1 %arg, label %bb2, label %bb1 @@ -50,304 +154,1139 @@ ret void } -; GCN-LABEL: {{^}}void_func_i8: -; GCN-NOT: v0 -; GCN: buffer_store_byte v0, off define void @void_func_i8(i8 %arg0) #0 { +; CIGFX89-LABEL: void_func_i8: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store i8 %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i8_zeroext: -; GCN-NOT: and_b32 -; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 { +; CI-LABEL: void_func_i8_zeroext: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, 12, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_i8_zeroext: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 12, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_i8_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 12, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i8_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 12, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ext = zext i8 %arg0 to i32 %add = add i32 %ext, 12 store i32 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i8_signext: -; GCN-NOT: v_bfe_i32 -; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 define void @void_func_i8_signext(i8 signext %arg0) #0 { +; CI-LABEL: void_func_i8_signext: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, 12, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_i8_signext: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 12, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_i8_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 12, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i8_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 12, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ext = sext i8 %arg0 to i32 %add = add i32 %ext, 12 store i32 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i16: -; GCN: buffer_store_short v0, off define void @void_func_i16(i16 %arg0) #0 { +; CIGFX89-LABEL: void_func_i16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store i16 %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i16_zeroext: -; GCN-NOT: v0 -; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 { +; CI-LABEL: void_func_i16_zeroext: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, 12, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_i16_zeroext: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 12, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_i16_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 12, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i16_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 12, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ext = zext i16 %arg0 to i32 %add = add i32 %ext, 12 store i32 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i16_signext: -; GCN-NOT: v0 -; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 define void @void_func_i16_signext(i16 signext %arg0) #0 { +; CI-LABEL: void_func_i16_signext: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, 12, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_i16_signext: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 12, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_i16_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 12, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i16_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 12, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ext = sext i16 %arg0 to i32 %add = add i32 %ext, 12 store i32 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i32: -; GCN-NOT: v0 -; GCN: buffer_store_dword v0, off define void @void_func_i32(i32 %arg0) #0 { +; CIGFX89-LABEL: void_func_i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store i32 %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i64: -; GCN-NOT: v[0:1] -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN: buffer_store_dwordx2 v[0:1], off define void @void_func_i64(i64 %arg0) #0 { +; CIGFX89-LABEL: void_func_i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store i64 %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_f16: -; VI-NOT: v0 -; CI: v_cvt_f16_f32_e32 v0, v0 -; GCN: buffer_store_short v0, off define void @void_func_f16(half %arg0) #0 { +; CI-LABEL: void_func_f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_f16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store half %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_f32 -; GCN-NOT: v0 -; GCN: buffer_store_dword v0, off define void @void_func_f32(float %arg0) #0 { +; CIGFX89-LABEL: void_func_f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store float %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_f64: -; GCN-NOT: v[0:1] -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN: buffer_store_dwordx2 v[0:1], off define void @void_func_f64(double %arg0) #0 { +; CIGFX89-LABEL: void_func_f64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store double %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v2i32: -; GCN-NOT: v[0:1] -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN: buffer_store_dwordx2 v[0:1], off define void @void_func_v2i32(<2 x i32> %arg0) #0 { +; CIGFX89-LABEL: void_func_v2i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <2 x i32> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v3i32: -; GCN-DAG: buffer_store_dwordx3 v[0:2], off define void @void_func_v3i32(<3 x i32> %arg0) #0 { +; CIGFX89-LABEL: void_func_v3i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <3 x i32> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v4i32: -; GCN: buffer_store_dwordx4 v[0:3], off define void @void_func_v4i32(<4 x i32> %arg0) #0 { +; CIGFX89-LABEL: void_func_v4i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <4 x i32> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v5i32: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dword v4, off define void @void_func_v5i32(<5 x i32> %arg0) #0 { +; CIGFX89-LABEL: void_func_v5i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v4, off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v5i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b32 v4, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <5 x i32> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v8i32: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off define void @void_func_v8i32(<8 x i32> %arg0) #0 { +; CIGFX89-LABEL: void_func_v8i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v8i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <8 x i32> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v16i32: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off define void @void_func_v16i32(<16 x i32> %arg0) #0 { +; CIGFX89-LABEL: void_func_v16i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x i32> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v32i32: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off -; GCN-DAG: buffer_store_dwordx4 v[16:19], off -; GCN-DAG: buffer_store_dwordx4 v[20:23], off -; GCN-DAG: buffer_store_dwordx4 v[24:27], off -; GCN-DAG: buffer_store_dwordx4 v[28:31], off define void @void_func_v32i32(<32 x i32> %arg0) #0 { +; CIGFX89-LABEL: void_func_v32i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(6) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <32 x i32> %arg0, ptr addrspace(1) undef ret void } ; 1 over register limit -; GCN-LABEL: {{^}}void_func_v33i32: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off -; GCN-DAG: buffer_load_dword v31, off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_store_dwordx4 v[16:19], off -; GCN-DAG: buffer_store_dwordx4 v[20:23], off -; GCN-DAG: buffer_store_dwordx4 v[24:27], off -; GCN-DAG: buffer_store_dwordx4 v[28:31], off -; GCN: buffer_store_dword [[STACKLOAD]], off define void @void_func_v33i32(<33 x i32> %arg0) #0 { +; CI-LABEL: void_func_v33i32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v33i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v33i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v33i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b32 v32, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <33 x i32> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v2i64: -; GCN: buffer_store_dwordx4 v[0:3], off define void @void_func_v2i64(<2 x i64> %arg0) #0 { +; CIGFX89-LABEL: void_func_v2i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <2 x i64> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v3i64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx2 v[4:5], off define void @void_func_v3i64(<3 x i64> %arg0) #0 { +; CIGFX89-LABEL: void_func_v3i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <3 x i64> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v4i64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off define void @void_func_v4i64(<4 x i64> %arg0) #0 { +; CIGFX89-LABEL: void_func_v4i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v4i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <4 x i64> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v5i64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx2 v[8:9], off define void @void_func_v5i64(<5 x i64> %arg0) #0 { +; CIGFX89-LABEL: void_func_v5i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v5i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <5 x i64> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v8i64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off define void @void_func_v8i64(<8 x i64> %arg0) #0 { +; CIGFX89-LABEL: void_func_v8i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v8i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <8 x i64> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v16i64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off -; GCN-DAG: buffer_store_dwordx4 v[16:19], off -; GCN-DAG: buffer_store_dwordx4 v[20:23], off -; GCN-DAG: buffer_store_dwordx4 v[24:27], off -; GCN-DAG: buffer_store_dwordx4 v[28:31], off define void @void_func_v16i64(<16 x i64> %arg0) #0 { +; CIGFX89-LABEL: void_func_v16i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(6) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v16i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x i64> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v2i16: -; GFX9-NOT: v0 -; GFX9: buffer_store_dword v0, off define void @void_func_v2i16(<2 x i16> %arg0) #0 { +; CI-LABEL: void_func_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v2i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <2 x i16> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v3i16: -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off -; GCN-DAG: buffer_store_short v{{[0-9]+}}, off define void @void_func_v3i16(<3 x i16> %arg0) #0 { +; CI-LABEL: void_func_v3i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: buffer_store_short v2, off, s[4:7], 0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v3i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_short v1, off, s[4:7], 0 +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <3 x i16> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v4i16: -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9: buffer_store_dwordx2 v[0:1], off define void @void_func_v4i16(<4 x i16> %arg0) #0 { +; CI-LABEL: void_func_v4i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_or_b32_e32 v1, v0, v1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v4i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <4 x i16> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v5i16: -; CI: v_lshlrev_b32 -; CI: v_and_b32 -; CI: v_lshlrev_b32 -; CI: v_or_b32 -; CI: v_or_b32 -; CI-DAG: buffer_store_short v -; CI-DAG: buffer_store_dwordx2 v - -; GFX89-DAG: buffer_store_short v2, off, -; GFX89-DAG: buffer_store_dwordx2 v[0:1], off - define void @void_func_v5i16(<5 x i16> %arg0) #0 { +; CI-LABEL: void_func_v5i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_or_b32_e32 v1, v0, v1 +; CI-NEXT: buffer_store_short v4, off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v5i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_short v2, off, s[4:7], 0 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v5i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b16 v2, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <5 x i16> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v8i16: -; GFX9-DAG: buffer_store_dwordx4 v[0:3], off define void @void_func_v8i16(<8 x i16> %arg0) #0 { +; CI-LABEL: void_func_v8i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v6, v6, v7 +; CI-NEXT: v_or_b32_e32 v5, v4, v5 +; CI-NEXT: v_or_b32_e32 v4, v2, v3 +; CI-NEXT: v_or_b32_e32 v3, v0, v1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v8i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v8i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <8 x i16> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v16i16: -; GFX9-DAG: buffer_store_dwordx4 v[0:3], off -; GFX9-DAG: buffer_store_dwordx4 v[4:7], off define void @void_func_v16i16(<16 x i16> %arg0) #0 { +; CI-LABEL: void_func_v16i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v5, v4, v5 +; CI-NEXT: v_or_b32_e32 v4, v2, v3 +; CI-NEXT: v_or_b32_e32 v3, v0, v1 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; CI-NEXT: v_or_b32_e32 v14, v1, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; CI-NEXT: v_or_b32_e32 v13, v1, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; CI-NEXT: v_or_b32_e32 v12, v1, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; CI-NEXT: v_or_b32_e32 v11, v1, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_or_b32_e32 v6, v6, v7 +; CI-NEXT: buffer_store_dwordx4 v[11:14], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v16i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v16i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x i16> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v2i24: -; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}v0, v1 define void @void_func_v2i24(<2 x i24> %arg0) #0 { +; CI-LABEL: void_func_v2i24: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v2i24: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v2i24: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v2i24: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b8 v1, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %elt0 = extractelement <2 x i24> %arg0, i32 0 %elt1 = extractelement <2 x i24> %arg0, i32 1 %add = add i24 %elt0, %elt1 @@ -355,197 +1294,734 @@ ret void } -; GCN-LABEL: {{^}}void_func_v2f32: -; GCN-NOT: v[0:1] -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN: buffer_store_dwordx2 v[0:1], off define void @void_func_v2f32(<2 x float> %arg0) #0 { +; CIGFX89-LABEL: void_func_v2f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <2 x float> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v3f32: -; GCN-DAG: buffer_store_dwordx3 v[0:2], off define void @void_func_v3f32(<3 x float> %arg0) #0 { +; CIGFX89-LABEL: void_func_v3f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <3 x float> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v4f32: -; GCN: buffer_store_dwordx4 v[0:3], off define void @void_func_v4f32(<4 x float> %arg0) #0 { +; CIGFX89-LABEL: void_func_v4f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <4 x float> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v8f32: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off define void @void_func_v8f32(<8 x float> %arg0) #0 { +; CIGFX89-LABEL: void_func_v8f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <8 x float> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v16f32: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off define void @void_func_v16f32(<16 x float> %arg0) #0 { +; CIGFX89-LABEL: void_func_v16f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v16f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x float> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v2f64: -; GCN: buffer_store_dwordx4 v[0:3], off define void @void_func_v2f64(<2 x double> %arg0) #0 { +; CIGFX89-LABEL: void_func_v2f64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <2 x double> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v3f64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx2 v[4:5], off define void @void_func_v3f64(<3 x double> %arg0) #0 { +; CIGFX89-LABEL: void_func_v3f64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <3 x double> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v4f64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off define void @void_func_v4f64(<4 x double> %arg0) #0 { +; CIGFX89-LABEL: void_func_v4f64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <4 x double> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v8f64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off define void @void_func_v8f64(<8 x double> %arg0) #0 { +; CIGFX89-LABEL: void_func_v8f64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v8f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <8 x double> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v16f64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off -; GCN-DAG: buffer_store_dwordx4 v[16:19], off -; GCN-DAG: buffer_store_dwordx4 v[20:23], off -; GCN-DAG: buffer_store_dwordx4 v[24:27], off -; GCN-DAG: buffer_store_dwordx4 v[28:31], off define void @void_func_v16f64(<16 x double> %arg0) #0 { +; CIGFX89-LABEL: void_func_v16f64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(6) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v16f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x double> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v2f16: -; GFX9-NOT: v0 -; GFX9: buffer_store_dword v0, off define void @void_func_v2f16(<2 x half> %arg0) #0 { +; CI-LABEL: void_func_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v2f16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <2 x half> %arg0, ptr addrspace(1) undef ret void } ; FIXME: Different abi if f16 legal -; GCN-LABEL: {{^}}void_func_v3f16: -; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v0 -; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v1 -; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v2 - -; GFX89-DAG: v0 -; GFX89-DAG: v1 - -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_dword define void @void_func_v3f16(<3 x half> %arg0) #0 { +; CI-LABEL: void_func_v3f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: buffer_store_short v2, off, s[4:7], 0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v3f16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_short v1, off, s[4:7], 0 +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <3 x half> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v4f16: -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9-NOT: v[0:1] -; GFX9: buffer_store_dwordx2 v[0:1], off define void @void_func_v4f16(<4 x half> %arg0) #0 { +; CI-LABEL: void_func_v4f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; CI-NEXT: v_or_b32_e32 v1, v2, v1 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; CI-NEXT: v_or_b32_e32 v0, v0, v2 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v4f16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <4 x half> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v8f16: -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9: buffer_store_dwordx4 v[0:3], off define void @void_func_v8f16(<8 x half> %arg0) #0 { +; CI-LABEL: void_func_v8f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; CI-NEXT: v_or_b32_e32 v5, v6, v5 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v4, v4, v6 +; CI-NEXT: v_or_b32_e32 v3, v2, v3 +; CI-NEXT: v_or_b32_e32 v2, v0, v1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v8f16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <8 x half> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v16f16: -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9-DAG: buffer_store_dwordx4 v[0:3], off -; GFX9-DAG: buffer_store_dwordx4 v[4:7], off define void @void_func_v16f16(<16 x half> %arg0) #0 { +; CI-LABEL: void_func_v16f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v5, v6, v5 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; CI-NEXT: v_or_b32_e32 v3, v2, v3 +; CI-NEXT: v_or_b32_e32 v2, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; CI-NEXT: v_or_b32_e32 v4, v4, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v13, v1, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; CI-NEXT: v_or_b32_e32 v12, v7, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v8 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v11, v1, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; CI-NEXT: v_or_b32_e32 v10, v7, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v16f16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v16f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x half> %arg0, ptr addrspace(1) undef ret void } ; Make sure there is no alignment requirement for passed vgprs. -; GCN-LABEL: {{^}}void_func_i32_i64_i32: -; GCN-NOT: v0 -; GCN: buffer_store_dword v0, off -; GCN: buffer_store_dwordx2 v[1:2] -; GCN: buffer_store_dword v3 define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 { +; CIGFX89-LABEL: void_func_i32_i64_i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dword v3, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i32_i64_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b32 v3, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile i32 %arg0, ptr addrspace(1) undef store volatile i64 %arg1, ptr addrspace(1) undef store volatile i32 %arg2, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_struct_i32: -; GCN-NOT: v0 -; GCN: buffer_store_dword v0, off define void @void_func_struct_i32({ i32 } %arg0) #0 { +; CIGFX89-LABEL: void_func_struct_i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_struct_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store { i32 } %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_struct_i8_i32: -; GCN-DAG: buffer_store_byte v0, off -; GCN-DAG: buffer_store_dword v1, off define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 { +; CIGFX89-LABEL: void_func_struct_i8_i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store { i8, i32 } %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32: -; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_store_dword v[[ELT1]] -; GCN-DAG: buffer_store_byte v[[ELT0]] define void @void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 { +; CIGFX89-LABEL: void_func_byval_struct_i8_i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; CIGFX89-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: s_waitcnt vmcnt(1) +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(1) +; CIGFX89-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_byval_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u8 v1, off, s32 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b8 v1, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %arg0.load = load { i8, i32 }, ptr addrspace(5) %arg0 store { i8, i32 } %arg0.load, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2: -; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32 glc{{$}} -; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:4 glc{{$}} -; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:8 glc{{$}} -; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:12 glc{{$}} - -; GCN: ds_write_b32 v0, v0 -; GCN: s_setpc_b64 define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }) %arg0, ptr addrspace(5) byval({ i8, i32 }) %arg1, i32 %arg2) #0 { +; CI-LABEL: void_func_byval_struct_i8_i32_x2: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_ubyte v3, off, s[0:3], s32 offset:8 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dword v4, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v3, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_byval_struct_i8_i32_x2: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v3, off, s[0:3], s32 offset:8 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v3, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ds_write_b32 v0, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_byval_struct_i8_i32_x2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v3, off, s[0:3], s32 offset:8 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v3, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_byval_struct_i8_i32_x2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_u8 v1, off, s32 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_load_u8 v3, off, s32 offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_load_b32 v4, off, s32 offset:12 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v2, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v1, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b32 v4, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v3, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_store_b32 v0, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %arg0.load = load volatile { i8, i32 }, ptr addrspace(5) %arg0 %arg1.load = load volatile { i8, i32 }, ptr addrspace(5) %arg1 store volatile { i8, i32 } %arg0.load, ptr addrspace(1) undef @@ -554,13 +2030,37 @@ ret void } -; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64: -; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off -; GCN-DAG: buffer_store_dwordx2 v[[[ARG1_LOAD0]]:[[ARG1_LOAD1]]], off define void @void_func_byval_i32_byval_i64(ptr addrspace(5) byval(i32) %arg0, ptr addrspace(5) byval(i64) %arg1) #0 { +; CIGFX89-LABEL: void_func_byval_i32_byval_i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; CIGFX89-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; CIGFX89-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: s_waitcnt vmcnt(2) +; CIGFX89-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(1) +; CIGFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_byval_i32_byval_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v2, off, s32 +; GFX11-NEXT: scratch_load_b64 v[0:1], off, s32 offset:8 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b32 v2, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %arg0.load = load i32, ptr addrspace(5) %arg0 %arg1.load = load i64, ptr addrspace(5) %arg1 store i32 %arg0.load, ptr addrspace(1) undef @@ -568,23 +2068,139 @@ ret void } -; GCN-LABEL: {{^}}void_func_v32i32_i32_i64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off -; GCN-DAG: buffer_store_dwordx4 v[16:19], off -; GCN-DAG: buffer_store_dwordx4 v[20:23], off -; GCN-DAG: buffer_store_dwordx4 v[24:27], off -; GCN-DAG: buffer_store_dwordx4 v[28:31], off -; GCN-DAG: buffer_load_dword v[[LOAD_ARG0_31:[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:12 - -; GCN: buffer_store_dword v[[LOAD_ARG1]] -; GCN: buffer_store_dwordx2 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]], off define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 { +; CI-LABEL: void_func_v32i32_i32_i64: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_i32_i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_i32_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_i32_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:8 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b64 v[32:33], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile i32 %arg1, ptr addrspace(1) undef store volatile i64 %arg2, ptr addrspace(1) undef @@ -592,26 +2208,167 @@ } ; FIXME: Different ext load types on CI vs. VI -; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16: -; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} - -; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} - -; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]] -; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]] - -; GCN: buffer_store_byte [[TRUNC_ARG1_I1]], off -; GCN: buffer_store_byte [[LOAD_ARG2]], off -; GCN: buffer_store_short [[LOAD_ARG3]], off -; GFX89: buffer_store_short [[LOAD_ARG4]], off - -; CI: buffer_store_short [[CVT_ARG4]], off define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 { +; CI-LABEL: void_func_v32i32_i1_i8_i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; CI-NEXT: v_and_b32_e32 v0, 1, v16 +; CI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v18, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v19, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_i1_i8_i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v0, 1, v20 +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v17, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v18, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_i1_i8_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 +; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v17, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v18, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_i1_i8_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_u8 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u16 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v35, off, s32 offset:16 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_and_b32_e32 v16, 1, v32 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v16, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b16 v34, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b16 v35, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile i1 %arg1, ptr addrspace(1) undef store volatile i8 %arg2, ptr addrspace(1) undef @@ -620,138 +2377,1136 @@ ret void } -; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}} - -; GCN: buffer_store_dwordx2 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]], off -; GCN: buffer_store_dwordx2 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]], off define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 { +; CI-LABEL: void_func_v32i32_v2i32_v2f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v2i32_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v2i32_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v2i32_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: buffer_store_b64 v[32:33], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b64 v[34:35], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <2 x i32> %arg1, ptr addrspace(1) undef store volatile <2 x float> %arg2, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v32i32_v2i16_v2f16: -; GFX9-DAG: buffer_load_dword [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GFX9-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GFX9: buffer_store_dword [[LOAD_ARG1]], off -; GFX9: buffer_store_short [[LOAD_ARG2]], off define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 { +; CI-LABEL: void_func_v32i32_v2i16_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v17, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v18, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v16, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v19, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v2i16_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v2i16_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v2i16_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b32 v32, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b32 v33, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <2 x i16> %arg1, ptr addrspace(1) undef store volatile <2 x half> %arg2, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}} - -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]], off -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 { +; CI-LABEL: void_func_v32i32_v2i64_v2f64: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v2i64_v2f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v2i64_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v2i64_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:20 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <2 x i64> %arg1, ptr addrspace(1) undef store volatile <2 x double> %arg2, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}} - -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]], off -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 { +; CI-LABEL: void_func_v32i32_v4i32_v4f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v4i32_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v4i32_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v4i32_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:20 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <4 x i32> %arg1, ptr addrspace(1) undef store volatile <4 x float> %arg2, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:32{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:36{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:40{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:44{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:48{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:52{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:56{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:60{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:64{{$}} - -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]], off -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]], off -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_4]]:[[LOAD_ARG2_7]]], off -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 { +; CI-LABEL: void_func_v32i32_v8i32_v8f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v8i32_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v8i32_v8f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v8i32_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x10 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:36 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: buffer_store_b128 v[48:51], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <8 x i32> %arg1, ptr addrspace(1) undef store volatile <8 x float> %arg2, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s32 offset:32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s32 offset:36{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s32 offset:40{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s32 offset:44{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s32 offset:48{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s32 offset:52{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s32 offset:56{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:60{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:64{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:68{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:72{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:76{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:80{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:84{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:88{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:92{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s32 offset:96{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s32 offset:100{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s32 offset:104{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s32 offset:108{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s32 offset:112{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s32 offset:116{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s32 offset:120{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:124{{$}} define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 { +; CI-LABEL: void_func_v32i32_v16i32_v16f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 +; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 +; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 +; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 +; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 +; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v16i32_v16f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v16i32_v16f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v16i32_v16f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x20 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:68 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(15) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: buffer_store_b128 v[84:87], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: buffer_store_b128 v[80:83], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: buffer_store_b128 v[68:71], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: buffer_store_b128 v[64:67], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: buffer_store_b128 v[48:51], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <16 x i32> %arg1, ptr addrspace(1) undef store volatile <16 x float> %arg2, ptr addrspace(1) undef @@ -759,15 +3514,49 @@ } ; Make sure v3 isn't a wasted register because of v3 types being promoted to v4 -; GCN-LABEL: {{^}}void_func_v3f32_wasted_reg: -; GCN: s_waitcnt -; GCN: ds_write_b32 v{{[0-9]+}}, v0 -; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1 -; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2 -; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3 -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 { +; CI-LABEL: void_func_v3f32_wasted_reg: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: ds_write_b32 v0, v1 +; CI-NEXT: ds_write_b32 v0, v2 +; CI-NEXT: ds_write_b32 v0, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v3f32_wasted_reg: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_write_b32 v0, v0 +; VI-NEXT: ds_write_b32 v0, v1 +; VI-NEXT: ds_write_b32 v0, v2 +; VI-NEXT: ds_write_b32 v0, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v3f32_wasted_reg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-NEXT: ds_write_b32 v0, v1 +; GFX9-NEXT: ds_write_b32 v0, v2 +; GFX9-NEXT: ds_write_b32 v0, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3f32_wasted_reg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_store_b32 v0, v0 +; GFX11-NEXT: ds_store_b32 v0, v1 +; GFX11-NEXT: ds_store_b32 v0, v2 +; GFX11-NEXT: ds_store_b32 v0, v3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %arg0.0 = extractelement <3 x float> %arg0, i32 0 %arg0.1 = extractelement <3 x float> %arg0, i32 1 %arg0.2 = extractelement <3 x float> %arg0, i32 2 @@ -778,15 +3567,49 @@ ret void } -; GCN-LABEL: {{^}}void_func_v3i32_wasted_reg: -; GCN: s_waitcnt -; GCN: ds_write_b32 v{{[0-9]+}}, v0 -; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1 -; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2 -; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3 -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 { +; CI-LABEL: void_func_v3i32_wasted_reg: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: ds_write_b32 v0, v1 +; CI-NEXT: ds_write_b32 v0, v2 +; CI-NEXT: ds_write_b32 v0, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v3i32_wasted_reg: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_write_b32 v0, v0 +; VI-NEXT: ds_write_b32 v0, v1 +; VI-NEXT: ds_write_b32 v0, v2 +; VI-NEXT: ds_write_b32 v0, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v3i32_wasted_reg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-NEXT: ds_write_b32 v0, v1 +; GFX9-NEXT: ds_write_b32 v0, v2 +; GFX9-NEXT: ds_write_b32 v0, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3i32_wasted_reg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_store_b32 v0, v0 +; GFX11-NEXT: ds_store_b32 v0, v1 +; GFX11-NEXT: ds_store_b32 v0, v2 +; GFX11-NEXT: ds_store_b32 v0, v3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %arg0.0 = extractelement <3 x i32> %arg0, i32 0 %arg0.1 = extractelement <3 x i32> %arg0, i32 1 %arg0.2 = extractelement <3 x i32> %arg0, i32 2 @@ -798,15 +3621,404 @@ } ; Check there is no crash. -; GCN-LABEL: {{^}}void_func_v16i8: define void @void_func_v16i8(<16 x i8> %arg0) #0 { +; CIGFX89-LABEL: void_func_v16i8: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v7, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v3, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v2, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v16i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b8 v15, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v14, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v13, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v12, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v11, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v10, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v9, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v8, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v7, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v6, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v5, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v4, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v3, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v2, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v1, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <16 x i8> %arg0, ptr addrspace(1) undef ret void } ; Check there is no crash. -; GCN-LABEL: {{^}}void_func_v32i32_v16i8: define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { +; CI-LABEL: void_func_v32i32_v16i8: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v16i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ubyte v9, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ubyte v11, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v4, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ubyte v5, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ubyte v6, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v16i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:60 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:28 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ubyte v9, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ubyte v11, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v4, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ubyte v5, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ubyte v6, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v16i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x10 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_u8 v32, off, s32 offset:64 +; GFX11-NEXT: scratch_load_u8 v33, off, s32 offset:60 +; GFX11-NEXT: scratch_load_u8 v34, off, s32 offset:56 +; GFX11-NEXT: scratch_load_u8 v35, off, s32 offset:52 +; GFX11-NEXT: scratch_load_u8 v36, off, s32 offset:48 +; GFX11-NEXT: scratch_load_u8 v37, off, s32 offset:44 +; GFX11-NEXT: scratch_load_u8 v38, off, s32 offset:40 +; GFX11-NEXT: scratch_load_u8 v39, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u8 v48, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u8 v49, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u8 v50, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u8 v51, off, s32 offset:20 +; GFX11-NEXT: scratch_load_u8 v52, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u8 v53, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u8 v54, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u8 v55, off, s32 offset:4 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(15) +; GFX11-NEXT: buffer_store_b8 v32, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(13) +; GFX11-NEXT: buffer_store_b8 v34, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: buffer_store_b8 v35, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: buffer_store_b8 v36, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: buffer_store_b8 v37, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: buffer_store_b8 v38, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: buffer_store_b8 v39, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: buffer_store_b8 v48, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: buffer_store_b8 v49, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: buffer_store_b8 v50, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: buffer_store_b8 v51, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: buffer_store_b8 v52, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: buffer_store_b8 v53, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b8 v54, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b8 v55, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <16 x i8> %arg1, ptr addrspace(1) undef ret void diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -1,464 +1,1345 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s - -; GCN-LABEL: {{^}}i1_func_void: -; GCN: buffer_load_ubyte v0, off -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,CI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,GFX89,GFX8 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,GFX89,GFX9 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s + define i1 @i1_func_void() #0 { +; GFX789-LABEL: i1_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i1_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i1, ptr addrspace(1) undef ret i1 %val } ; FIXME: Missing and? -; GCN-LABEL: {{^}}i1_zeroext_func_void: -; GCN: buffer_load_ubyte v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define zeroext i1 @i1_zeroext_func_void() #0 { +; GFX789-LABEL: i1_zeroext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i1_zeroext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i1, ptr addrspace(1) undef ret i1 %val } -; GCN-LABEL: {{^}}i1_signext_func_void: -; GCN: buffer_load_ubyte v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1{{$}} -; GCN-NEXT: s_setpc_b64 define signext i1 @i1_signext_func_void() #0 { +; GFX789-LABEL: i1_signext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i1_signext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i1, ptr addrspace(1) undef ret i1 %val } -; GCN-LABEL: {{^}}i8_func_void: -; GCN: buffer_load_ubyte v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define i8 @i8_func_void() #0 { +; GFX789-LABEL: i8_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i8_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i8, ptr addrspace(1) undef ret i8 %val } -; GCN-LABEL: {{^}}i8_zeroext_func_void: -; GCN: buffer_load_ubyte v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define zeroext i8 @i8_zeroext_func_void() #0 { +; GFX789-LABEL: i8_zeroext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i8_zeroext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i8, ptr addrspace(1) undef ret i8 %val } -; GCN-LABEL: {{^}}i8_signext_func_void: -; GCN: buffer_load_sbyte v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define signext i8 @i8_signext_func_void() #0 { +; GFX789-LABEL: i8_signext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i8_signext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_i8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i8, ptr addrspace(1) undef ret i8 %val } -; GCN-LABEL: {{^}}i16_func_void: -; GCN: buffer_load_ushort v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define i16 @i16_func_void() #0 { +; GFX789-LABEL: i16_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i16, ptr addrspace(1) undef ret i16 %val } -; GCN-LABEL: {{^}}i16_zeroext_func_void: -; GCN: buffer_load_ushort v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define zeroext i16 @i16_zeroext_func_void() #0 { +; GFX789-LABEL: i16_zeroext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i16_zeroext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i16, ptr addrspace(1) undef ret i16 %val } -; GCN-LABEL: {{^}}i16_signext_func_void: -; GCN: buffer_load_sshort v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define signext i16 @i16_signext_func_void() #0 { +; GFX789-LABEL: i16_signext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_sshort v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i16_signext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_i16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i16, ptr addrspace(1) undef ret i16 %val } -; GCN-LABEL: {{^}}i32_func_void: -; GCN: buffer_load_dword v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define i32 @i32_func_void() #0 { +; GFX789-LABEL: i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i32, ptr addrspace(1) undef ret i32 %val } -; GCN-LABEL: {{^}}i48_func_void: -; GCN: buffer_load_dword v0, off -; GCN-NEXT: buffer_load_ushort v1, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define i48 @i48_func_void() #0 { +; GFX789-LABEL: i48_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX789-NEXT: buffer_load_ushort v1, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i48_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i48, ptr addrspace(1) undef, align 8 ret i48 %val } -; GCN-LABEL: {{^}}i48_zeroext_func_void: -; GCN: buffer_load_dword v0, off -; GCN-NEXT: buffer_load_ushort v1, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define zeroext i48 @i48_zeroext_func_void() #0 { +; GFX789-LABEL: i48_zeroext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX789-NEXT: buffer_load_ushort v1, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i48_zeroext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i48, ptr addrspace(1) undef, align 8 ret i48 %val } -; GCN-LABEL: {{^}}i48_signext_func_void: -; GCN: buffer_load_dword v0, off -; GCN-NEXT: buffer_load_sshort v1, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define signext i48 @i48_signext_func_void() #0 { +; GFX789-LABEL: i48_signext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX789-NEXT: buffer_load_sshort v1, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i48_signext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_i16 v1, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i48, ptr addrspace(1) undef, align 8 ret i48 %val } -; GCN-LABEL: {{^}}i63_func_void: -; GCN: s_waitcnt -; GCN-NEXT: s_setpc_b64 define i63 @i63_func_void(i63 %val) #0 { +; GFX789-LABEL: i63_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i63_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] ret i63 %val } -; GCN-LABEL: {{^}}i63_zeroext_func_void: -; GCN: s_waitcnt -; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GCN-NEXT: s_setpc_b64 define zeroext i63 @i63_zeroext_func_void(i63 %val) #0 { +; GFX789-LABEL: i63_zeroext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i63_zeroext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] ret i63 %val } -; GCN-LABEL: {{^}}i63_signext_func_void: -; GCN: s_waitcnt -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; CI-NEXT: v_ashr_i64 v[0:1], v[0:1], 1 - -; GFX89-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX89-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1] - -; GCN-NEXT: s_setpc_b64 define signext i63 @i63_signext_func_void(i63 %val) #0 { +; CI-LABEL: i63_signext_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; CI-NEXT: v_ashr_i64 v[0:1], v[0:1], 1 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: i63_signext_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX89-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1] +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i63_signext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] ret i63 %val } -; GCN-LABEL: {{^}}i64_func_void: -; GCN: buffer_load_dwordx2 v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define i64 @i64_func_void() #0 { +; GFX789-LABEL: i64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i64, ptr addrspace(1) undef ret i64 %val } -; GCN-LABEL: {{^}}i65_func_void: -; GCN-DAG: buffer_load_dwordx2 v[0:1], off -; GCN-DAG: buffer_load_ubyte v2, off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define i65 @i65_func_void() #0 { +; GFX789-LABEL: i65_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i65_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_u8 v2, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i65, ptr addrspace(1) undef ret i65 %val } -; GCN-LABEL: {{^}}f32_func_void: -; GCN: buffer_load_dword v0, off, s[4:7], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define float @f32_func_void() #0 { +; GFX789-LABEL: f32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: f32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load float, ptr addrspace(1) undef ret float %val } -; GCN-LABEL: {{^}}f64_func_void: -; GCN: buffer_load_dwordx2 v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define double @f64_func_void() #0 { +; GFX789-LABEL: f64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: f64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load double, ptr addrspace(1) undef ret double %val } -; GCN-LABEL: {{^}}v2f64_func_void: -; GCN: buffer_load_dwordx4 v[0:3], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <2 x double> @v2f64_func_void() #0 { +; GFX789-LABEL: v2f64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v2f64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <2 x double>, ptr addrspace(1) undef ret <2 x double> %val } -; GCN-LABEL: {{^}}v2i32_func_void: -; GCN: buffer_load_dwordx2 v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <2 x i32> @v2i32_func_void() #0 { +; GFX789-LABEL: v2i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v2i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <2 x i32>, ptr addrspace(1) undef ret <2 x i32> %val } -; GCN-LABEL: {{^}}v3i32_func_void: -; GCN: buffer_load_dwordx3 v[0:2], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <3 x i32> @v3i32_func_void() #0 { +; GFX789-LABEL: v3i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx3 v[0:2], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v3i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b96 v[0:2], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <3 x i32>, ptr addrspace(1) undef ret <3 x i32> %val } -; GCN-LABEL: {{^}}v4i32_func_void: -; GCN: buffer_load_dwordx4 v[0:3], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <4 x i32> @v4i32_func_void() #0 { +; GFX789-LABEL: v4i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v4i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <4 x i32>, ptr addrspace(1) undef ret <4 x i32> %val } -; GCN-LABEL: {{^}}v5i32_func_void: -; GCN-DAG: buffer_load_dword v4, off -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <5 x i32> @v5i32_func_void() #0 { +; GFX789-LABEL: v5i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dword v4, off, s[4:7], 0 glc +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v5i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b32 v4, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load volatile <5 x i32>, ptr addrspace(1) undef ret <5 x i32> %val } -; GCN-LABEL: {{^}}v8i32_func_void: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <8 x i32> @v8i32_func_void() #0 { +; GFX789-LABEL: v8i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v8i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <8 x i32>, ptr addrspace(1) %ptr ret <8 x i32> %val } -; GCN-LABEL: {{^}}v16i32_func_void: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <16 x i32> @v16i32_func_void() #0 { +; GFX789-LABEL: v16i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX789-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v16i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <16 x i32>, ptr addrspace(1) %ptr ret <16 x i32> %val } -; GCN-LABEL: {{^}}v32i32_func_void: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN-DAG: buffer_load_dwordx4 v[16:19], off -; GCN-DAG: buffer_load_dwordx4 v[20:23], off -; GCN-DAG: buffer_load_dwordx4 v[24:27], off -; GCN-DAG: buffer_load_dwordx4 v[28:31], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <32 x i32> @v32i32_func_void() #0 { +; GFX789-LABEL: v32i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX789-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX789-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; GFX789-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; GFX789-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; GFX789-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v32i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48 +; GFX11-NEXT: buffer_load_b128 v[16:19], off, s[0:3], 0 offset:64 +; GFX11-NEXT: buffer_load_b128 v[20:23], off, s[0:3], 0 offset:80 +; GFX11-NEXT: buffer_load_b128 v[24:27], off, s[0:3], 0 offset:96 +; GFX11-NEXT: buffer_load_b128 v[28:31], off, s[0:3], 0 offset:112 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <32 x i32>, ptr addrspace(1) %ptr ret <32 x i32> %val } -; GCN-LABEL: {{^}}v2i64_func_void: -; GCN: buffer_load_dwordx4 v[0:3], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <2 x i64> @v2i64_func_void() #0 { +; GFX789-LABEL: v2i64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v2i64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <2 x i64>, ptr addrspace(1) undef ret <2 x i64> %val } -; GCN-LABEL: {{^}}v3i64_func_void: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx2 v[4:5], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <3 x i64> @v3i64_func_void() #0 { +; GFX789-LABEL: v3i64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:16 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v3i64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[0:3], 0 offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <3 x i64>, ptr addrspace(1) %ptr ret <3 x i64> %val } -; GCN-LABEL: {{^}}v4i64_func_void: -; GCN: buffer_load_dwordx4 v[0:3], off -; GCN: buffer_load_dwordx4 v[4:7], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <4 x i64> @v4i64_func_void() #0 { +; GFX789-LABEL: v4i64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v4i64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <4 x i64>, ptr addrspace(1) %ptr ret <4 x i64> %val } -; GCN-LABEL: {{^}}v5i64_func_void: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx2 v[8:9], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <5 x i64> @v5i64_func_void() #0 { +; GFX789-LABEL: v5i64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: buffer_load_dwordx2 v[8:9], off, s[4:7], 0 offset:32 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v5i64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b64 v[8:9], off, s[0:3], 0 offset:32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <5 x i64>, ptr addrspace(1) %ptr ret <5 x i64> %val } -; GCN-LABEL: {{^}}v8i64_func_void: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <8 x i64> @v8i64_func_void() #0 { +; GFX789-LABEL: v8i64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX789-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v8i64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <8 x i64>, ptr addrspace(1) %ptr ret <8 x i64> %val } -; GCN-LABEL: {{^}}v16i64_func_void: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN-DAG: buffer_load_dwordx4 v[16:19], off -; GCN-DAG: buffer_load_dwordx4 v[20:23], off -; GCN-DAG: buffer_load_dwordx4 v[24:27], off -; GCN-DAG: buffer_load_dwordx4 v[28:31], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <16 x i64> @v16i64_func_void() #0 { +; GFX789-LABEL: v16i64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX789-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX789-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; GFX789-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; GFX789-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; GFX789-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v16i64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48 +; GFX11-NEXT: buffer_load_b128 v[16:19], off, s[0:3], 0 offset:64 +; GFX11-NEXT: buffer_load_b128 v[20:23], off, s[0:3], 0 offset:80 +; GFX11-NEXT: buffer_load_b128 v[24:27], off, s[0:3], 0 offset:96 +; GFX11-NEXT: buffer_load_b128 v[28:31], off, s[0:3], 0 offset:112 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <16 x i64>, ptr addrspace(1) %ptr ret <16 x i64> %val } -; GCN-LABEL: {{^}}v2i16_func_void: -; GFX9: buffer_load_dword v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define <2 x i16> @v2i16_func_void() #0 { +; CI-LABEL: v2i16_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v2i16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v2i16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <2 x i16>, ptr addrspace(1) undef ret <2 x i16> %val } -; GCN-LABEL: {{^}}v3i16_func_void: -; GFX9: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define <3 x i16> @v3i16_func_void() #0 { +; CI-LABEL: v3i16_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: v_mov_b32_e32 v2, v3 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v3i16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v3i16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <3 x i16>, ptr addrspace(1) undef ret <3 x i16> %val } -; GCN-LABEL: {{^}}v4i16_func_void: -; GFX9: buffer_load_dwordx2 v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define <4 x i16> @v4i16_func_void() #0 { +; CI-LABEL: v4i16_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; CI-NEXT: v_mov_b32_e32 v2, v1 +; CI-NEXT: v_mov_b32_e32 v1, v4 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v4i16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v4i16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <4 x i16>, ptr addrspace(1) undef ret <4 x i16> %val } -; GCN-LABEL: {{^}}v4f16_func_void: -; GFX9: buffer_load_dwordx2 v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define <4 x half> @v4f16_func_void() #0 { +; CI-LABEL: v4f16_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v4f16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v4f16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <4 x half>, ptr addrspace(1) undef ret <4 x half> %val } ; FIXME: Mixing buffer and global ; FIXME: Should not scalarize -; GCN-LABEL: {{^}}v5i16_func_void: -; GFX9: buffer_load_dwordx4 v[0:3] -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 define <5 x i16> @v5i16_func_void() #0 { +; CI-LABEL: v5i16_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: buffer_load_sshort v4, off, s[4:7], 0 offset:8 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; CI-NEXT: v_mov_b32_e32 v2, v1 +; CI-NEXT: v_mov_b32_e32 v1, v5 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v5i16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v5i16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <5 x i16>, ptr addrspace(1) %ptr ret <5 x i16> %val } -; GCN-LABEL: {{^}}v8i16_func_void: -; GFX9-DAG: buffer_load_dwordx4 v[0:3], off -; GFX9: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define <8 x i16> @v8i16_func_void() #0 { +; CI-LABEL: v8i16_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; CI-NEXT: v_mov_b32_e32 v0, v8 +; CI-NEXT: v_mov_b32_e32 v2, v9 +; CI-NEXT: v_mov_b32_e32 v4, v10 +; CI-NEXT: v_mov_b32_e32 v6, v11 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v8i16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v8i16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <8 x i16>, ptr addrspace(1) %ptr ret <8 x i16> %val } -; GCN-LABEL: {{^}}v16i16_func_void: -; GFX9: buffer_load_dwordx4 v[0:3], off -; GFX9: buffer_load_dwordx4 v[4:7], off -; GFX9: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define <16 x i16> @v16i16_func_void() #0 { +; CI-LABEL: v16i16_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[22:25], off, s[4:7], 0 +; CI-NEXT: buffer_load_dwordx4 v[18:21], off, s[4:7], 0 offset:16 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v24 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v25 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v18 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v21 +; CI-NEXT: v_mov_b32_e32 v0, v22 +; CI-NEXT: v_mov_b32_e32 v2, v23 +; CI-NEXT: v_mov_b32_e32 v4, v24 +; CI-NEXT: v_mov_b32_e32 v6, v25 +; CI-NEXT: v_mov_b32_e32 v8, v18 +; CI-NEXT: v_mov_b32_e32 v10, v19 +; CI-NEXT: v_mov_b32_e32 v12, v20 +; CI-NEXT: v_mov_b32_e32 v14, v21 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v16i16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v16i16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <16 x i16>, ptr addrspace(1) %ptr ret <16 x i16> %val } ; FIXME: Should pack -; GCN-LABEL: {{^}}v16i8_func_void: -; GCN-DAG: v12 -; GCN-DAG: v13 -; GCN-DAG: v14 -; GCN-DAG: v15 define <16 x i8> @v16i8_func_void() #0 { +; GFX789-LABEL: v16i8_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; GFX789-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX789-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; GFX789-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX789-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX789-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX789-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX789-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX789-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX789-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX789-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX789-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX789-NEXT: v_mov_b32_e32 v4, v1 +; GFX789-NEXT: v_mov_b32_e32 v8, v2 +; GFX789-NEXT: v_mov_b32_e32 v12, v3 +; GFX789-NEXT: v_mov_b32_e32 v1, v16 +; GFX789-NEXT: v_mov_b32_e32 v2, v17 +; GFX789-NEXT: v_mov_b32_e32 v3, v18 +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v16i8_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v16 +; GFX11-NEXT: v_mov_b32_e32 v8, v2 +; GFX11-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18 +; GFX11-NEXT: v_mov_b32_e32 v2, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <16 x i8>, ptr addrspace(1) %ptr ret <16 x i8> %val } ; FIXME: Should pack -; GCN-LABEL: {{^}}v4i8_func_void: -; GCN: buffer_load_dword v0 -; GCN-DAG: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-DAG: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-DAG: v_lshrrev_b32_e32 v3, 24, v0 -; GCN: s_setpc_b64 define <4 x i8> @v4i8_func_void() #0 { +; GFX789-LABEL: v4i8_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX789-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX789-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v4i8_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <4 x i8>, ptr addrspace(1) %ptr ret <4 x i8> %val } -; GCN-LABEL: {{^}}struct_i8_i32_func_void: -; GCN-DAG: buffer_load_dword v1 -; GCN-DAG: buffer_load_ubyte v0 -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define {i8, i32} @struct_i8_i32_func_void() #0 { +; GFX789-LABEL: struct_i8_i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: struct_i8_i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load { i8, i32 }, ptr addrspace(1) undef ret { i8, i32 } %val } -; GCN-LABEL: {{^}}void_func_sret_struct_i8_i32: -; GCN: buffer_load_ubyte [[VAL0:v[0-9]+]] -; GCN: buffer_load_dword [[VAL1:v[0-9]+]] -; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], 0 offen{{$}} -; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], 0 offen offset:4{{$}} define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %arg0) #0 { +; GFX789-LABEL: void_func_sret_struct_i8_i32: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 glc +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; GFX789-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_sret_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u8 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v2, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b8 v0, v1, off +; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load volatile i8, ptr addrspace(1) undef %val1 = load volatile i32, ptr addrspace(1) undef %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0 @@ -471,140 +1352,939 @@ ; FIXME: Should be able to fold offsets in all of these pre-gfx9. Call ; lowering introduces an extra CopyToReg/CopyFromReg obscuring the ; AssertZext inserted. Not using it introduces the spills. - -; GCN-LABEL: {{^}}v33i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:4{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:8{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:12{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:16{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:20{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:24{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:28{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:32{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:36{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:40{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:44{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:48{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:52{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:56{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:60{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:64{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:68{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:72{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:76{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:80{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:84{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:88{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:92{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:96{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:100{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:104{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:108{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:112{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:116{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}} -; GFX9: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define <33 x i32> @v33i32_func_void() #0 { +; CI-LABEL: v33i32_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_add_i32_e32 v34, vcc, 0x80, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; CI-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 +; CI-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 +; CI-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 +; CI-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; CI-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; CI-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 +; CI-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 +; CI-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v33, vcc, 0x7c, v0 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v4, vcc, 0x78, v0 +; CI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; CI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; CI-NEXT: s_waitcnt vmcnt(11) +; CI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; CI-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; CI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; CI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; CI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; CI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; CI-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; CI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 +; CI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; CI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; CI-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; CI-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; CI-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; CI-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; CI-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; CI-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; CI-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v33i32_func_void: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_add_u32_e32 v34, vcc, 0x80, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; GFX8-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 +; GFX8-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 +; GFX8-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 +; GFX8-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; GFX8-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; GFX8-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 +; GFX8-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 0x7c, v0 +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x78, v0 +; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x74, v0 +; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x64, v0 +; GFX8-NEXT: s_waitcnt vmcnt(11) +; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 +; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; GFX8-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x58, v0 +; GFX8-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x50, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x48, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x44, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 60, v0 +; GFX8-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 52, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 48, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 44, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 40, v0 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 36, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; GFX8-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; GFX8-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; GFX8-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; GFX8-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; GFX8-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GFX8-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v33i32_func_void: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 +; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 +; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 +; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v33i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:112 +; GFX11-NEXT: buffer_load_b128 v[5:8], off, s[0:3], 0 offset:96 +; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80 +; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64 +; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48 +; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_add_i32 s1, s0, 0x70 +; GFX11-NEXT: s_add_i32 s2, s0, 0x60 +; GFX11-NEXT: s_add_i32 s3, s0, 0x50 +; GFX11-NEXT: s_add_i32 s4, s0, 64 +; GFX11-NEXT: s_add_i32 s5, s0, 48 +; GFX11-NEXT: s_add_i32 s6, s0, 32 +; GFX11-NEXT: s_add_i32 s7, s0, 16 +; GFX11-NEXT: s_add_i32 s8, s0, 0x80 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b32 off, v33, s8 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <33 x i32>, ptr addrspace(1) %ptr ret <33 x i32> %val } -; GCN-LABEL: {{^}}struct_v32i32_i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:4{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:8{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:12{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:16{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:20{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:24{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:28{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:32{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:36{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:40{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:44{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:48{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:52{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:56{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:60{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:64{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:68{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:72{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:76{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:80{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:84{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:88{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:92{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:96{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:100{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:104{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:108{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:112{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:116{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}} -; GFX9: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { +; CI-LABEL: struct_v32i32_i32_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_add_i32_e32 v34, vcc, 0x80, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; CI-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 +; CI-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 +; CI-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 +; CI-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; CI-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; CI-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 +; CI-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 +; CI-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v33, vcc, 0x7c, v0 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v4, vcc, 0x78, v0 +; CI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; CI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; CI-NEXT: s_waitcnt vmcnt(11) +; CI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; CI-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; CI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; CI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; CI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; CI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; CI-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; CI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 +; CI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; CI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; CI-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; CI-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; CI-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; CI-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; CI-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; CI-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; CI-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: struct_v32i32_i32_func_void: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_add_u32_e32 v34, vcc, 0x80, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; GFX8-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 +; GFX8-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 +; GFX8-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 +; GFX8-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; GFX8-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; GFX8-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 +; GFX8-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 0x7c, v0 +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x78, v0 +; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x74, v0 +; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x64, v0 +; GFX8-NEXT: s_waitcnt vmcnt(11) +; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 +; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; GFX8-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x58, v0 +; GFX8-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x50, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x48, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x44, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 60, v0 +; GFX8-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 52, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 48, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 44, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 40, v0 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 36, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; GFX8-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; GFX8-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; GFX8-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; GFX8-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; GFX8-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GFX8-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: struct_v32i32_i32_func_void: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 +; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 +; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 +; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: struct_v32i32_i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:112 +; GFX11-NEXT: buffer_load_b128 v[5:8], off, s[0:3], 0 offset:96 +; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80 +; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64 +; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48 +; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_add_i32 s1, s0, 0x70 +; GFX11-NEXT: s_add_i32 s2, s0, 0x60 +; GFX11-NEXT: s_add_i32 s3, s0, 0x50 +; GFX11-NEXT: s_add_i32 s4, s0, 64 +; GFX11-NEXT: s_add_i32 s5, s0, 48 +; GFX11-NEXT: s_add_i32 s6, s0, 32 +; GFX11-NEXT: s_add_i32 s7, s0, 16 +; GFX11-NEXT: s_add_i32 s8, s0, 0x80 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b32 off, v33, s8 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr ret { <32 x i32>, i32 }%val } -; GCN-LABEL: {{^}}struct_i32_v32i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:132{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:136{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:140{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:144{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:148{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:152{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:156{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:160{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:164{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:168{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:172{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:176{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:180{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:184{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:188{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:192{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:196{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:200{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:204{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:208{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:212{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:216{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:220{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:224{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:228{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:232{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:236{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:240{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:244{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:248{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:252{{$}} -; GFX9: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { +; CI-LABEL: struct_i32_v32i32_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dword v33, off, s[4:7], 0 +; CI-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240 +; CI-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224 +; CI-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208 +; CI-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192 +; CI-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176 +; CI-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160 +; CI-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144 +; CI-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 offset:128 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v33, vcc, 0xfc, v0 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v4, vcc, 0xf8, v0 +; CI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0xf4, v0 +; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 +; CI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0xec, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0xe4, v0 +; CI-NEXT: s_waitcnt vmcnt(11) +; CI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0xe0, v0 +; CI-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0xdc, v0 +; CI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0xd8, v0 +; CI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0xd4, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 0xd0, v0 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0xcc, v0 +; CI-NEXT: v_add_i32_e32 v6, vcc, 0xc8, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0xb8, v0 +; CI-NEXT: v_add_i32_e32 v7, vcc, 0xc4, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0xbc, v0 +; CI-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v4, vcc, 0xb4, v0 +; CI-NEXT: v_add_i32_e32 v8, vcc, 0xb0, v0 +; CI-NEXT: v_add_i32_e32 v9, vcc, 0xac, v0 +; CI-NEXT: v_add_i32_e32 v10, vcc, 0xa8, v0 +; CI-NEXT: v_add_i32_e32 v11, vcc, 0xa4, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v5, vcc, 0xa0, v0 +; CI-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x9c, v0 +; CI-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x98, v0 +; CI-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x94, v0 +; CI-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x90, v0 +; CI-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x8c, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x88, v0 +; CI-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x84, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 +; CI-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: struct_i32_v32i32_func_void: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: buffer_load_dword v33, off, s[4:7], 0 +; GFX8-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240 +; GFX8-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224 +; GFX8-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208 +; GFX8-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192 +; GFX8-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176 +; GFX8-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160 +; GFX8-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144 +; GFX8-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 offset:128 +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 0xfc, v0 +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xf8, v0 +; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xf4, v0 +; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xf0, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xec, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xe8, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xe4, v0 +; GFX8-NEXT: s_waitcnt vmcnt(11) +; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe0, v0 +; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xdc, v0 +; GFX8-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xd8, v0 +; GFX8-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xd4, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xd0, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xcc, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xc8, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xb8, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xc4, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xc0, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xbc, v0 +; GFX8-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xb4, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xb0, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xac, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xa8, v0 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xa4, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xa0, v0 +; GFX8-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x9c, v0 +; GFX8-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x98, v0 +; GFX8-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x94, v0 +; GFX8-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x90, v0 +; GFX8-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x8c, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x88, v0 +; GFX8-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x84, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0 +; GFX8-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: struct_i32_v32i32_func_void: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240 +; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224 +; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192 +; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176 +; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160 +; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144 +; GFX9-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:252 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:248 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:244 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:240 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:236 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:232 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:228 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:224 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:204 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:200 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:196 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:192 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:188 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:184 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:180 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:176 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:172 +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:168 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:164 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:160 +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:156 +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:152 +; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:148 +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:144 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:140 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:136 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:132 +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: struct_i32_v32i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:240 +; GFX11-NEXT: buffer_load_b128 v[5:8], off, s[0:3], 0 offset:224 +; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:208 +; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:192 +; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:176 +; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:160 +; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:144 +; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:128 +; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_add_i32 s1, s0, 0xf0 +; GFX11-NEXT: s_add_i32 s2, s0, 0xe0 +; GFX11-NEXT: s_add_i32 s3, s0, 0xd0 +; GFX11-NEXT: s_add_i32 s4, s0, 0xc0 +; GFX11-NEXT: s_add_i32 s5, s0, 0xb0 +; GFX11-NEXT: s_add_i32 s6, s0, 0xa0 +; GFX11-NEXT: s_add_i32 s7, s0, 0x90 +; GFX11-NEXT: s_add_i32 s8, s0, 0x80 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: scratch_store_b128 off, v[29:32], s8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b32 off, v33, s0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load { i32, <32 x i32> }, ptr addrspace(1) %ptr ret { i32, <32 x i32> }%val } ; Make sure the last struct component is returned in v3, not v4. -; GCN-LABEL: {{^}}v3i32_struct_func_void_wasted_reg: -; GCN: ds_read_b32 v0, -; GCN: ds_read_b32 v1, -; GCN: ds_read_b32 v2, -; GCN: ds_read_b32 v3, define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 { +; CI-LABEL: v3i32_struct_func_void_wasted_reg: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_read_b32 v1, v0 +; CI-NEXT: ds_read_b32 v2, v0 +; CI-NEXT: ds_read_b32 v3, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v3i32_struct_func_void_wasted_reg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_read_b32 v1, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v3i32_struct_func_void_wasted_reg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v1, v0 +; GFX9-NEXT: ds_read_b32 v2, v0 +; GFX9-NEXT: ds_read_b32 v3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v3i32_struct_func_void_wasted_reg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_b32 v0, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_load_b32 v1, v0 +; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: ds_load_b32 v3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load0 = load volatile i32, ptr addrspace(3) undef %load1 = load volatile i32, ptr addrspace(3) undef %load2 = load volatile i32, ptr addrspace(3) undef @@ -618,12 +2298,53 @@ ret { <3 x i32>, i32 } %insert.4 } -; GCN-LABEL: {{^}}v3f32_struct_func_void_wasted_reg: -; GCN: ds_read_b32 v0, -; GCN: ds_read_b32 v1, -; GCN: ds_read_b32 v2, -; GCN: ds_read_b32 v3, define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 { +; CI-LABEL: v3f32_struct_func_void_wasted_reg: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_read_b32 v1, v0 +; CI-NEXT: ds_read_b32 v2, v0 +; CI-NEXT: ds_read_b32 v3, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v3f32_struct_func_void_wasted_reg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_read_b32 v1, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v3f32_struct_func_void_wasted_reg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v1, v0 +; GFX9-NEXT: ds_read_b32 v2, v0 +; GFX9-NEXT: ds_read_b32 v3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v3f32_struct_func_void_wasted_reg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_b32 v0, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_load_b32 v1, v0 +; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: ds_load_b32 v3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load0 = load volatile float, ptr addrspace(3) undef %load1 = load volatile float, ptr addrspace(3) undef %load2 = load volatile float, ptr addrspace(3) undef @@ -637,14 +2358,54 @@ ret { <3 x float>, i32 } %insert.4 } -; GCN-LABEL: {{^}}void_func_sret_max_known_zero_bits: -; GCN: v_lshrrev_b32_e32 [[LSHR16:v[0-9]+]], 16, v0 -; GCN: ds_write_b32 {{v[0-9]+}}, [[LSHR16]] - -; GCN: v_mov_b32_e32 [[HIGH_BITS:v[0-9]+]], 0 -; GCN: ds_write_b32 {{v[0-9]+}}, [[HIGH_BITS]] -; GCN-NEXT: ds_write_b32 {{v[0-9]+}}, [[HIGH_BITS]] define void @void_func_sret_max_known_zero_bits(ptr addrspace(5) sret(i8) %arg0) #0 { +; CI-LABEL: void_func_sret_max_known_zero_bits: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: void_func_sret_max_known_zero_bits: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_write_b32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: ds_write_b32 v0, v0 +; GFX8-NEXT: ds_write_b32 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_sret_max_known_zero_bits: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_sret_max_known_zero_bits: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 17, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: ds_store_b32 v0, v1 +; GFX11-NEXT: ds_store_b32 v0, v0 +; GFX11-NEXT: ds_store_b32 v0, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %arg0.int = ptrtoint ptr addrspace(5) %arg0 to i32 %lshr0 = lshr i32 %arg0.int, 16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -1,12 +1,24 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc -;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +; GFX68-LABEL: buffer_store: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX68-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc +; GFX68-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 glc +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 slc +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1) @@ -14,34 +26,65 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +; GFX68-LABEL: buffer_store_immoffs: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_immoffs: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:42 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +; GFX68-LABEL: buffer_store_ofs: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_ofs: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) ret void } ; Ideally, the register allocator would avoid the wait here -; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +; VERDE-LABEL: buffer_store_wait: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; VERDE-NEXT: s_waitcnt expcnt(0) +; VERDE-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_wait: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_wait: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 offen +; GFX11-NEXT: buffer_load_b128 v[0:3], v5, s[0:3], 0 offen +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0) @@ -49,29 +92,52 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) { +; GFX68-LABEL: buffer_store_x1: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x1: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %offset) #0 { +; GFX68-LABEL: buffer_store_x2: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x2: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_and: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; GFX68-LABEL: buffer_store_x1_offen_merged_and: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX68-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x1_offen_merged_and: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -87,11 +153,22 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_or: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28 define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; GFX68-LABEL: buffer_store_x1_offen_merged_or: +; GFX68: ; %bb.0: +; GFX68-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; GFX68-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX68-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x1_offen_merged_or: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = shl i32 %inp, 6 %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -108,13 +185,22 @@ ret void } - -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; GFX68-LABEL: buffer_store_x1_offen_merged_glc_slc: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4 +; GFX68-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc +; GFX68-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x1_offen_merged_glc_slc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_store_b64 v[1:2], v0, s[0:3], 0 offen offset:4 +; GFX11-NEXT: buffer_store_b64 v[3:4], v0, s[0:3], 0 offen offset:12 glc +; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -130,10 +216,17 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_and: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { +; GFX68-LABEL: buffer_store_x2_offen_merged_and: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x2_offen_merged_and: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) @@ -141,10 +234,19 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_or: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) { +; GFX68-LABEL: buffer_store_x2_offen_merged_or: +; GFX68: ; %bb.0: +; GFX68-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX68-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x2_offen_merged_or: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = shl i32 %inp, 4 %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 @@ -153,11 +255,20 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; GFX68-LABEL: buffer_store_x1_offset_merged: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; GFX68-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x1_offset_merged: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 +; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0) @@ -167,21 +278,38 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { +; GFX68-LABEL: buffer_store_x2_offset_merged: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x2_offset_merged: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_int: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -;CHECK: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc -;CHECK: buffer_store_dword v6, off, s[0:3], 0 slc define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) { +; GFX68-LABEL: buffer_store_int: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX68-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc +; GFX68-NEXT: buffer_store_dword v6, off, s[0:3], 0 slc +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_int: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 glc +; GFX11-NEXT: buffer_store_b32 v6, off, s[0:3], 0 slc +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 1) @@ -189,12 +317,19 @@ ret void } -;CHECK-LABEL: {{^}}raw_buffer_store_byte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) { +; GFX68-LABEL: raw_buffer_store_byte: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX68-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: raw_buffer_store_byte: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 @@ -202,12 +337,19 @@ ret void } -;CHECK-LABEL: {{^}}raw_buffer_store_short: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) { +; GFX68-LABEL: raw_buffer_store_short: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX68-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: raw_buffer_store_short: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 @@ -215,12 +357,17 @@ ret void } -;CHECK-LABEL: {{^}}raw_buffer_store_f16: -;CHECK-NEXT: %bb. -;CHECK-NOT: v0 -;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) { +; GFX68-LABEL: raw_buffer_store_f16: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: raw_buffer_store_f16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: %trunc = trunc i32 %v1 to i16 %cast = bitcast i16 %trunc to half @@ -228,59 +375,142 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_v2f16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %offset) { +; VERDE-LABEL: buffer_store_v2f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_v2f16: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_v2f16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v4f16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v4f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; VERDE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VERDE-NEXT: v_cvt_f16_f32_e32 v5, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v2, v1 +; VERDE-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v2 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_v4f16: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_v4f16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}raw_buffer_store_i16: -;CHECK-NEXT: %bb. -;CHECK-NOT: v0 -;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) { +; GFX68-LABEL: raw_buffer_store_i16: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: raw_buffer_store_i16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: %trunc = trunc i32 %v1 to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, <4 x i32> %rsrc, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v2i16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data, i32 %offset) { +; VERDE-LABEL: buffer_store_v2i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_v2i16: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_v2i16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v4i16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v4i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v2, v2, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v0, v1 +; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_v4i16: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_v4i16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}raw_buffer_store_x1_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; GFX68-LABEL: raw_buffer_store_x1_offset_merged: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; GFX68-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: raw_buffer_store_x1_offset_merged: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 +; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0) @@ -290,14 +520,28 @@ ret void } -;CHECK-LABEL: {{^}}raw_buffer_store_x1_offset_swizzled_not_merged: -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:12 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:16 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:28 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:32 define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX68-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 +; GFX68-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:12 +; GFX68-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:16 +; GFX68-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:28 +; GFX68-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:32 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 offset:4 +; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 offset:8 +; GFX11-NEXT: buffer_store_b32 v2, off, s[0:3], 0 offset:12 +; GFX11-NEXT: buffer_store_b32 v3, off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_store_b32 v4, off, s[0:3], 0 offset:28 +; GFX11-NEXT: buffer_store_b32 v5, off, s[0:3], 0 offset:32 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8) call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 8) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll @@ -1,12 +1,26 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_store_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_store_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s + define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +; GFX68-LABEL: buffer_store: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_mov_b32_e32 v12, 0 +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v12, s[0:3], 0 idxen +; GFX68-NEXT: buffer_store_dwordx4 v[4:7], v12, s[0:3], 0 idxen glc +; GFX68-NEXT: buffer_store_dwordx4 v[8:11], v12, s[0:3], 0 idxen slc +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v12, 0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_store_b128 v[0:3], v12, s[0:3], 0 idxen +; GFX11-NEXT: buffer_store_b128 v[4:7], v12, s[0:3], 0 idxen glc +; GFX11-NEXT: buffer_store_b128 v[8:11], v12, s[0:3], 0 idxen slc +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1) @@ -14,62 +28,123 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +; GFX68-LABEL: buffer_store_immoffs: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_mov_b32_e32 v4, 0 +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen offset:42 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_immoffs: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen offset:42 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_idx: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { +; GFX68-LABEL: buffer_store_idx: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_idx: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +; GFX68-LABEL: buffer_store_ofs: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: s_mov_b32 s4, 0 +; GFX68-NEXT: v_mov_b32_e32 v5, v4 +; GFX68-NEXT: v_mov_b32_e32 v4, s4 +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_ofs: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, s4 +; GFX11-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { +; GFX68-LABEL: buffer_store_both: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_both: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both_reversed: -;CHECK: v_mov_b32_e32 v6, v4 -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { +; GFX68-LABEL: buffer_store_both_reversed: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_mov_b32_e32 v6, v4 +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_both_reversed: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: buffer_store_b128 v[0:3], v[5:6], s[0:3], 0 idxen offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0) ret void } ; Ideally, the register allocator would avoid the wait here -; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +; VERDE-LABEL: buffer_store_wait: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; VERDE-NEXT: s_waitcnt expcnt(0) +; VERDE-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_wait: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; GFX8-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_wait: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen +; GFX11-NEXT: buffer_load_b128 v[0:3], v5, s[0:3], 0 idxen +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0) @@ -77,30 +152,56 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +; GFX68-LABEL: buffer_store_x1: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x1: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 { +; GFX68-LABEL: buffer_store_x2: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x2: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_int: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_store_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_store_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) { +; GFX68-LABEL: buffer_store_int: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_mov_b32_e32 v7, 0 +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v7, s[0:3], 0 idxen +; GFX68-NEXT: buffer_store_dwordx2 v[4:5], v7, s[0:3], 0 idxen glc +; GFX68-NEXT: buffer_store_dword v6, v7, s[0:3], 0 idxen slc +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_int: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v7, 0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_store_b128 v[0:3], v7, s[0:3], 0 idxen +; GFX11-NEXT: buffer_store_b64 v[4:5], v7, s[0:3], 0 idxen glc +; GFX11-NEXT: buffer_store_b32 v6, v7, s[0:3], 0 idxen slc +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1) @@ -108,12 +209,19 @@ ret void } -;CHECK-LABEL: {{^}}struct_buffer_store_byte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) { +; GFX68-LABEL: struct_buffer_store_byte: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX68-NEXT: buffer_store_byte v0, v1, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: struct_buffer_store_byte: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b8 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 @@ -121,39 +229,89 @@ ret void } -;CHECK-LABEL: {{^}}struct_buffer_store_f16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_f16_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1, i32 %index) { +; GFX68-LABEL: struct_buffer_store_f16: +; GFX68: ; %bb.0: +; GFX68-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX68-NEXT: buffer_store_short v0, v1, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: struct_buffer_store_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %v2 = fptrunc float %v1 to half call void @llvm.amdgcn.struct.buffer.store.f16(half %v2, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_buffer_store_v2f16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %v1, i32 %index) { +; VERDE-LABEL: struct_buffer_store_v2f16: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: struct_buffer_store_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: struct_buffer_store_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_buffer_store_v4f16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %v1, i32 %index) { +; VERDE-LABEL: struct_buffer_store_v4f16: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; VERDE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VERDE-NEXT: v_cvt_f16_f32_e32 v5, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v2, v1 +; VERDE-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v2 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 idxen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: struct_buffer_store_v4f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: struct_buffer_store_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_buffer_store_i16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1, i32 %index) { +; GFX68-LABEL: struct_buffer_store_i16: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX68-NEXT: buffer_store_short v0, v1, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: struct_buffer_store_i16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 @@ -161,18 +319,51 @@ ret void } -;CHECK-LABEL: {{^}}struct_buffer_store_vif16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16> %v1, i32 %index) { +; VERDE-LABEL: struct_buffer_store_vif16: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: struct_buffer_store_vif16: +; GFX8: ; %bb.0: +; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: struct_buffer_store_vif16: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_buffer_store_v4i16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %v1, i32 %index) { +; VERDE-LABEL: struct_buffer_store_v4i16: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v2, v2, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v0, v1 +; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 idxen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: struct_buffer_store_v4i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: struct_buffer_store_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } @@ -192,6 +383,5 @@ declare void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) #0 - attributes #0 = { nounwind } attributes #1 = { nounwind readonly }