diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -1,7 +1,9 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s -; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,HSA %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s +; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=HSA %s declare hidden void @external_void_func_i1(i1) #0 declare hidden void @external_void_func_i1_signext(i1 signext) #0 @@ -57,221 +59,1422 @@ declare hidden void @external_void_func_v16i8(<16 x i8>) #0 - ; FIXME: Should be passing -1 -; GCN-LABEL: {{^}}test_call_external_void_func_i1_imm: -; MESA: s_mov_b32 s36, SCRATCH_RSRC_DWORD - -; MESA-DAG: s_mov_b64 s[0:1], s[36:37] - -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+12 -; GCN-DAG: v_mov_b32_e32 v0, 1{{$}} -; MESA-DAG: s_mov_b64 s[2:3], s[38:39] - -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { +; VI-LABEL: test_call_external_void_func_i1_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_i1_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 1 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i1_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i1_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i1_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_i1(i1 true) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext: - -; HSA: buffer_load_ubyte [[VAR:v[0-9]+]] -; HSA: s_mov_b32 s32, 0 -; MESA-DAG: buffer_load_ubyte [[VAR:v[0-9]+]] -; MESA-DAG: s_mov_b32 s32, 0{{$}} - -; GCN: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+12 -; GCN-NEXT: v_bfe_i32 v0, [[VAR]], 0, 1 -; GCN-NEXT: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { +; VI-LABEL: test_call_external_void_func_i1_signext: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; VI-NEXT: v_bfe_i32 v0, v0, 0, 1 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_i1_signext: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; CI-NEXT: v_bfe_i32 v0, v0, 0, 1 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i1_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i1_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_signext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_signext@rel32@hi+12 +; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i1_signext: +; HSA: ; %bb.0: +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12 +; HSA-NEXT: v_bfe_i32 v0, v0, 0, 1 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %var = load volatile i1, ptr addrspace(1) undef call void @external_void_func_i1_signext(i1 signext %var) ret void } ; FIXME: load should be scheduled before getpc -; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext: - -; HSA: buffer_load_ubyte [[VAL:v[0-9]+]] -; HSA-DAG: s_mov_b32 s32, 0{{$}} - -; MESA: buffer_load_ubyte [[VAL:v[0-9]+]] -; MESA-DAG: s_mov_b32 s32, 0{{$}} - -; GCN: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12 -; GCN-NEXT: v_and_b32_e32 v0, 1, [[VAL]] -; GCN-NEXT: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { +; VI-LABEL: test_call_external_void_func_i1_zeroext: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; VI-NEXT: v_and_b32_e32 v0, 1, v0 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_i1_zeroext: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; CI-NEXT: v_and_b32_e32 v0, 1, v0 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i1_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i1_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_zeroext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_zeroext@rel32@hi+12 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i1_zeroext: +; HSA: ; %bb.0: +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1_zeroext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1_zeroext@rel32@hi+12 +; HSA-NEXT: v_and_b32_e32 v0, 1, v0 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %var = load volatile i1, ptr addrspace(1) undef call void @external_void_func_i1_zeroext(i1 zeroext %var) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm: - -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+12 -; GCN-DAG: v_mov_b32_e32 v0, 0x7b - -; GCN-DAG: s_mov_b32 s32, 0{{$}} - -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { +; VI-LABEL: test_call_external_void_func_i8_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x7b +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_i8_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 0x7b +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i8_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i8_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i8_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x7b +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i8@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_i8(i8 123) ret void } ; FIXME: don't wait before call -; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext: - -; GCN-DAG: buffer_load_sbyte [[VAL:v[0-9]+]] -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+12 - -; GCN-DAG: s_mov_b32 s32, 0 - -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { +; VI-LABEL: test_call_external_void_func_i8_signext: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_i8_signext: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i8_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i8_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_i8 v0, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_signext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_signext@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i8_signext: +; HSA: ; %bb.0: +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 glc +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8_signext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i8_signext@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %var = load volatile i8, ptr addrspace(1) undef call void @external_void_func_i8_signext(i8 signext %var) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext: - -; GCN-DAG: buffer_load_ubyte [[VAL:v[0-9]+]] -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+12 - -; GCN-DAG: s_mov_b32 s32, 0 - -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { +; VI-LABEL: test_call_external_void_func_i8_zeroext: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_i8_zeroext: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i8_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i8_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_zeroext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_zeroext@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i8_zeroext: +; HSA: ; %bb.0: +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8_zeroext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i8_zeroext@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %var = load volatile i8, ptr addrspace(1) undef call void @external_void_func_i8_zeroext(i8 zeroext %var) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i16_imm: -; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}} - -; GCN-DAG: s_mov_b32 s32, 0 - -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { +; VI-LABEL: test_call_external_void_func_i16_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x7b +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_i16_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 0x7b +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i16_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x7b +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_i16(i16 123) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext: - -; GCN-DAG: buffer_load_sshort [[VAL:v[0-9]+]] -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+12 - -; GCN-DAG: s_mov_b32 s32, 0 - -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { +; VI-LABEL: test_call_external_void_func_i16_signext: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_i16_signext: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i16_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i16_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_i16 v0, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_signext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_signext@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i16_signext: +; HSA: ; %bb.0: +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_sshort v0, off, s[4:7], 0 glc +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16_signext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16_signext@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %var = load volatile i16, ptr addrspace(1) undef call void @external_void_func_i16_signext(i16 signext %var) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i16_zeroext: - -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+12 - -; GCN-DAG: s_mov_b32 s32, 0 - -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { +; VI-LABEL: test_call_external_void_func_i16_zeroext: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_i16_zeroext: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i16_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i16_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_zeroext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_zeroext@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i16_zeroext: +; HSA: ; %bb.0: +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16_zeroext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16_zeroext@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %var = load volatile i16, ptr addrspace(1) undef call void @external_void_func_i16_zeroext(i16 zeroext %var) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm: - -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+12 -; GCN-DAG: v_mov_b32_e32 v0, 42 -; GCN-DAG: s_mov_b32 s32, 0 - -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { +; VI-LABEL: test_call_external_void_func_i32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_i32_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 42 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_i32(i32 42) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i64_imm: -; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}} -; GCN-DAG: v_mov_b32_e32 v1, 0{{$}} -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i64@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i64@rel32@hi+12 -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { +; VI-LABEL: test_call_external_void_func_i64_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x7b +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_i64_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 0x7b +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i64_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x7b +; HSA-NEXT: v_mov_b32_e32 v1, 0 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_i64(i64 123) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2i64: -; GCN: buffer_load_dwordx4 v[0:3] -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { +; VI-LABEL: test_call_external_void_func_v2i64: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s1, s0 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v2i64: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b32 s0, 0 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s1, s0 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s5, s4 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2i64: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s8, 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 s9, s8 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <2 x i64>, ptr addrspace(1) null call void @external_void_func_v2i64(<2 x i64> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2i64_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1 -; GCN-DAG: v_mov_b32_e32 v1, 2 -; GCN-DAG: v_mov_b32_e32 v2, 3 -; GCN-DAG: v_mov_b32_e32 v3, 4 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { +; VI-LABEL: test_call_external_void_func_v2i64_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, 2 +; VI-NEXT: v_mov_b32_e32 v2, 3 +; VI-NEXT: v_mov_b32_e32 v3, 4 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v2i64_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 1 +; CI-NEXT: v_mov_b32_e32 v1, 2 +; CI-NEXT: v_mov_b32_e32 v2, 3 +; CI-NEXT: v_mov_b32_e32 v3, 4 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2i64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_mov_b32_e32 v2, 3 +; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2i64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2i64_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1 +; HSA-NEXT: v_mov_b32_e32 v1, 2 +; HSA-NEXT: v_mov_b32_e32 v2, 3 +; HSA-NEXT: v_mov_b32_e32 v3, 4 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v2i64(<2 x i64> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3i64: -; GCN: buffer_load_dwordx4 v[0:3] -; GCN: v_mov_b32_e32 v4, 1 -; GCN: v_mov_b32_e32 v5, 2 -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { +; VI-LABEL: test_call_external_void_func_v3i64: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s1, s0 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v4, 1 +; VI-NEXT: v_mov_b32_e32 v5, 2 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v3i64: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b32 s0, 0 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s1, s0 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v4, 1 +; CI-NEXT: v_mov_b32_e32 v5, 2 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, 2 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s5, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3i64: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s8, 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 s9, s8 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: v_mov_b32_e32 v4, 1 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v5, 2 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %load = load <2 x i64>, ptr addrspace(1) null %val = shufflevector <2 x i64> %load, <2 x i64> , <3 x i32> @@ -279,343 +1482,3110 @@ ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v4i64: -; GCN: buffer_load_dwordx4 v[0:3] -; GCN-DAG: v_mov_b32_e32 v4, 1 -; GCN-DAG: v_mov_b32_e32 v5, 2 -; GCN-DAG: v_mov_b32_e32 v6, 3 -; GCN-DAG: v_mov_b32_e32 v7, 4 - -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { +; VI-LABEL: test_call_external_void_func_v4i64: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s1, s0 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v4, 1 +; VI-NEXT: v_mov_b32_e32 v5, 2 +; VI-NEXT: v_mov_b32_e32 v6, 3 +; VI-NEXT: v_mov_b32_e32 v7, 4 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v4i64: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b32 s0, 0 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_mov_b32 s1, s0 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v4, 1 +; CI-NEXT: v_mov_b32_e32 v5, 2 +; CI-NEXT: v_mov_b32_e32 v6, 3 +; CI-NEXT: v_mov_b32_e32 v7, 4 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v4i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, 2 +; GFX9-NEXT: v_mov_b32_e32 v6, 3 +; GFX9-NEXT: v_mov_b32_e32 v7, 4 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v4i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s5, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v4i64: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s8, 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 s9, s8 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: v_mov_b32_e32 v4, 1 +; HSA-NEXT: v_mov_b32_e32 v5, 2 +; HSA-NEXT: v_mov_b32_e32 v6, 3 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v7, 4 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %load = load <2 x i64>, ptr addrspace(1) null %val = shufflevector <2 x i64> %load, <2 x i64> , <4 x i32> call void @external_void_func_v4i64(<4 x i64> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_f16_imm: -; VI: v_mov_b32_e32 v0, 0x4400 -; CI: v_mov_b32_e32 v0, 4.0 -; GCN-NOT: v0 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { +; VI-LABEL: test_call_external_void_func_f16_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x4400 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_f16_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 4.0 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_f16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_f16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_f16_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x4400 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_f16(half 4.0) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_f32_imm: -; GCN: v_mov_b32_e32 v0, 4.0 -; GCN-NOT: v0 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { +; VI-LABEL: test_call_external_void_func_f32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 4.0 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_f32_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 4.0 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_f32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_f32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_f32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 4.0 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_f32(float 4.0) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2f32_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1.0 -; GCN-DAG: v_mov_b32_e32 v1, 2.0 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { +; VI-LABEL: test_call_external_void_func_v2f32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v2f32_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 1.0 +; CI-NEXT: v_mov_b32_e32 v1, 2.0 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2f32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2f32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2f32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1.0 +; HSA-NEXT: v_mov_b32_e32 v1, 2.0 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v2f32(<2 x float> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3f32_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1.0 -; GCN-DAG: v_mov_b32_e32 v1, 2.0 -; GCN-DAG: v_mov_b32_e32 v2, 4.0 -; GCN-NOT: v3, -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { +; VI-LABEL: test_call_external_void_func_v3f32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-NEXT: v_mov_b32_e32 v2, 4.0 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v3f32_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 1.0 +; CI-NEXT: v_mov_b32_e32 v1, 2.0 +; CI-NEXT: v_mov_b32_e32 v2, 4.0 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3f32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3f32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3f32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1.0 +; HSA-NEXT: v_mov_b32_e32 v1, 2.0 +; HSA-NEXT: v_mov_b32_e32 v2, 4.0 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v3f32(<3 x float> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v5f32_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1.0 -; GCN-DAG: v_mov_b32_e32 v1, 2.0 -; GCN-DAG: v_mov_b32_e32 v2, 4.0 -; GCN-DAG: v_mov_b32_e32 v3, -1.0 -; GCN-DAG: v_mov_b32_e32 v4, 0.5 -; GCN-NOT: v5, -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { +; VI-LABEL: test_call_external_void_func_v5f32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-NEXT: v_mov_b32_e32 v2, 4.0 +; VI-NEXT: v_mov_b32_e32 v3, -1.0 +; VI-NEXT: v_mov_b32_e32 v4, 0.5 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v5f32_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 1.0 +; CI-NEXT: v_mov_b32_e32 v1, 2.0 +; CI-NEXT: v_mov_b32_e32 v2, 4.0 +; CI-NEXT: v_mov_b32_e32 v3, -1.0 +; CI-NEXT: v_mov_b32_e32 v4, 0.5 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v5f32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v3, -1.0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v5f32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v5f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v5f32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v5f32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1.0 +; HSA-NEXT: v_mov_b32_e32 v1, 2.0 +; HSA-NEXT: v_mov_b32_e32 v2, 4.0 +; HSA-NEXT: v_mov_b32_e32 v3, -1.0 +; HSA-NEXT: v_mov_b32_e32 v4, 0.5 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v5f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v5f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v5f32(<5 x float> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm: -; GCN: v_mov_b32_e32 v0, 0{{$}} -; GCN: v_mov_b32_e32 v1, 0x40100000 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { +; VI-LABEL: test_call_external_void_func_f64_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0x40100000 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_f64_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: v_mov_b32_e32 v1, 0x40100000 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_f64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_f64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_f64_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NEXT: v_mov_b32_e32 v1, 0x40100000 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_f64(double 4.0) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2f64_imm: -; GCN: v_mov_b32_e32 v0, 0{{$}} -; GCN: v_mov_b32_e32 v1, 2.0 -; GCN: v_mov_b32_e32 v2, 0{{$}} -; GCN: v_mov_b32_e32 v3, 0x40100000 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { +; VI-LABEL: test_call_external_void_func_v2f64_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40100000 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v2f64_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: v_mov_b32_e32 v1, 2.0 +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_mov_b32_e32 v3, 0x40100000 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2f64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2f64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2f64_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NEXT: v_mov_b32_e32 v1, 2.0 +; HSA-NEXT: v_mov_b32_e32 v2, 0 +; HSA-NEXT: v_mov_b32_e32 v3, 0x40100000 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v2f64(<2 x double> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3f64_imm: -; GCN-DAG: v_mov_b32_e32 v0, 0{{$}} -; GCN-DAG: v_mov_b32_e32 v1, 2.0 -; GCN-DAG: v_mov_b32_e32 v2, 0{{$}} -; GCN-DAG: v_mov_b32_e32 v3, 0x40100000 -; GCN-DAG: v_mov_b32_e32 v4, 0{{$}} -; GCN-DAG: v_mov_b32_e32 v5, 0x40200000 -; GCN-DAG: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { +; VI-LABEL: test_call_external_void_func_v3f64_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40100000 +; VI-NEXT: v_mov_b32_e32 v4, 0 +; VI-NEXT: v_mov_b32_e32 v5, 0x40200000 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v3f64_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: v_mov_b32_e32 v1, 2.0 +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_mov_b32_e32 v3, 0x40100000 +; CI-NEXT: v_mov_b32_e32 v4, 0 +; CI-NEXT: v_mov_b32_e32 v5, 0x40200000 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3f64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3f64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3f64_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NEXT: v_mov_b32_e32 v1, 2.0 +; HSA-NEXT: v_mov_b32_e32 v2, 0 +; HSA-NEXT: v_mov_b32_e32 v3, 0x40100000 +; HSA-NEXT: v_mov_b32_e32 v4, 0 +; HSA-NEXT: v_mov_b32_e32 v5, 0x40200000 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v3f64(<3 x double> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2i16: -; GFX9: buffer_load_dword v0 -; GFX9-NOT: v0 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { +; VI-LABEL: test_call_external_void_func_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2i16: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <2 x i16>, ptr addrspace(1) undef call void @external_void_func_v2i16(<2 x i16> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3i16: -; GFX9: buffer_load_dwordx2 v[0:1] -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { +; VI-LABEL: test_call_external_void_func_v3i16: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v3i16: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: v_mov_b32_e32 v2, v3 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3i16: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <3 x i16>, ptr addrspace(1) undef call void @external_void_func_v3i16(<3 x i16> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3f16: -; GFX9: buffer_load_dwordx2 v[0:1] -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { +; VI-LABEL: test_call_external_void_func_v3f16: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v3f16: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3f16: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <3 x half>, ptr addrspace(1) undef call void @external_void_func_v3f16(<3 x half> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3i16_imm: -; GFX9: v_mov_b32_e32 v0, 0x20001 -; GFX9: v_mov_b32_e32 v1, 3 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { +; VI-LABEL: test_call_external_void_func_v3i16_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x20001 +; VI-NEXT: v_mov_b32_e32 v1, 3 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v3i16_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 1 +; CI-NEXT: v_mov_b32_e32 v1, 2 +; CI-NEXT: v_mov_b32_e32 v2, 3 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3i16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX9-NEXT: v_mov_b32_e32 v1, 3 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3i16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3i16_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 +; HSA-NEXT: v_mov_b32_e32 v1, 3 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v3i16(<3 x i16> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3f16_imm: -; GFX9: v_mov_b32_e32 v0, 0x40003c00 -; GFX9: v_mov_b32_e32 v1, 0x4400 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { +; VI-LABEL: test_call_external_void_func_v3f16_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; VI-NEXT: v_mov_b32_e32 v1, 0x4400 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v3f16_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 1.0 +; CI-NEXT: v_mov_b32_e32 v1, 2.0 +; CI-NEXT: v_mov_b32_e32 v2, 4.0 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3f16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3f16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3f16_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; HSA-NEXT: v_mov_b32_e32 v1, 0x4400 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v3f16(<3 x half> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v4i16: -; GFX9: buffer_load_dwordx2 v[0:1] -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { +; VI-LABEL: test_call_external_void_func_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v4i16: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; CI-NEXT: v_mov_b32_e32 v2, v1 +; CI-NEXT: v_mov_b32_e32 v1, v4 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v4i16: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <4 x i16>, ptr addrspace(1) undef call void @external_void_func_v4i16(<4 x i16> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v4i16_imm: -; GFX9-DAG: v_mov_b32_e32 v0, 0x20001 -; GFX9-DAG: v_mov_b32_e32 v1, 0x40003 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { +; VI-LABEL: test_call_external_void_func_v4i16_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x20001 +; VI-NEXT: v_mov_b32_e32 v1, 0x40003 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v4i16_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 1 +; CI-NEXT: v_mov_b32_e32 v1, 2 +; CI-NEXT: v_mov_b32_e32 v2, 3 +; CI-NEXT: v_mov_b32_e32 v3, 4 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v4i16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v4i16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v4i16_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 +; HSA-NEXT: v_mov_b32_e32 v1, 0x40003 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v4i16(<4 x i16> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2f16: -; GFX9: buffer_load_dword v0 -; GFX9-NOT: v0 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { +; VI-LABEL: test_call_external_void_func_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2f16: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) undef call void @external_void_func_v2f16(<2 x half> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2i32: -; GCN: buffer_load_dwordx2 v[0:1] -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { +; VI-LABEL: test_call_external_void_func_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v2i32: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <2 x i32>, ptr addrspace(1) undef call void @external_void_func_v2i32(<2 x i32> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2i32_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1 -; GCN-DAG: v_mov_b32_e32 v1, 2 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { +; VI-LABEL: test_call_external_void_func_v2i32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, 2 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v2i32_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 1 +; CI-NEXT: v_mov_b32_e32 v1, 2 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2i32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1 +; HSA-NEXT: v_mov_b32_e32 v1, 2 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v2i32(<2 x i32> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: {{.*}} - -; GCN-NOT: v3{{$}} -; GCN-DAG: v_mov_b32_e32 v0, 3 -; GCN-DAG: v_mov_b32_e32 v1, 4 -; GCN-DAG: v_mov_b32_e32 v2, 5 - -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { +; VI-LABEL: test_call_external_void_func_v3i32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: v_mov_b32_e32 v1, 4 +; VI-NEXT: v_mov_b32_e32 v2, 5 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v3i32_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 3 +; CI-NEXT: v_mov_b32_e32 v1, 4 +; CI-NEXT: v_mov_b32_e32 v2, 5 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: v_mov_b32_e32 v2, 5 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 +; GFX11-NEXT: v_mov_b32_e32 v2, 5 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3i32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 3 +; HSA-NEXT: v_mov_b32_e32 v1, 4 +; HSA-NEXT: v_mov_b32_e32 v2, 5 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v3i32(<3 x i32> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_i32: -; GCN-DAG: v_mov_b32_e32 v0, 3 -; GCN-DAG: v_mov_b32_e32 v1, 4 -; GCN-DAG: v_mov_b32_e32 v2, 5 -; GCN-DAG: v_mov_b32_e32 v3, 6 define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { +; VI-LABEL: test_call_external_void_func_v3i32_i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: v_mov_b32_e32 v1, 4 +; VI-NEXT: v_mov_b32_e32 v2, 5 +; VI-NEXT: v_mov_b32_e32 v3, 6 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v3i32_i32: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 3 +; CI-NEXT: v_mov_b32_e32 v1, 4 +; CI-NEXT: v_mov_b32_e32 v2, 5 +; CI-NEXT: v_mov_b32_e32 v3, 6 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3i32_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: v_mov_b32_e32 v2, 5 +; GFX9-NEXT: v_mov_b32_e32 v3, 6 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3i32_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 +; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i32_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i32_i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3i32_i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 3 +; HSA-NEXT: v_mov_b32_e32 v1, 4 +; HSA-NEXT: v_mov_b32_e32 v2, 5 +; HSA-NEXT: v_mov_b32_e32 v3, 6 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v3i32_i32(<3 x i32> , i32 6) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v4i32: -; GCN: buffer_load_dwordx4 v[0:3] -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { +; VI-LABEL: test_call_external_void_func_v4i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v4i32: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v4i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <4 x i32>, ptr addrspace(1) undef call void @external_void_func_v4i32(<4 x i32> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v4i32_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1 -; GCN-DAG: v_mov_b32_e32 v1, 2 -; GCN-DAG: v_mov_b32_e32 v2, 3 -; GCN-DAG: v_mov_b32_e32 v3, 4 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { +; VI-LABEL: test_call_external_void_func_v4i32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, 2 +; VI-NEXT: v_mov_b32_e32 v2, 3 +; VI-NEXT: v_mov_b32_e32 v3, 4 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v4i32_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 1 +; CI-NEXT: v_mov_b32_e32 v1, 2 +; CI-NEXT: v_mov_b32_e32 v2, 3 +; CI-NEXT: v_mov_b32_e32 v3, 4 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v4i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_mov_b32_e32 v2, 3 +; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v4i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v4i32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1 +; HSA-NEXT: v_mov_b32_e32 v1, 2 +; HSA-NEXT: v_mov_b32_e32 v2, 3 +; HSA-NEXT: v_mov_b32_e32 v3, 4 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v4i32(<4 x i32> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v5i32_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1 -; GCN-DAG: v_mov_b32_e32 v1, 2 -; GCN-DAG: v_mov_b32_e32 v2, 3 -; GCN-DAG: v_mov_b32_e32 v3, 4 -; GCN-DAG: v_mov_b32_e32 v4, 5 -; GCN-NOT: v5, -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { +; VI-LABEL: test_call_external_void_func_v5i32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, 2 +; VI-NEXT: v_mov_b32_e32 v2, 3 +; VI-NEXT: v_mov_b32_e32 v3, 4 +; VI-NEXT: v_mov_b32_e32 v4, 5 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v5i32_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 1 +; CI-NEXT: v_mov_b32_e32 v1, 2 +; CI-NEXT: v_mov_b32_e32 v2, 3 +; CI-NEXT: v_mov_b32_e32 v3, 4 +; CI-NEXT: v_mov_b32_e32 v4, 5 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v5i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_mov_b32_e32 v2, 3 +; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: v_mov_b32_e32 v4, 5 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v5i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: v_mov_b32_e32 v4, 5 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v5i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v5i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v5i32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1 +; HSA-NEXT: v_mov_b32_e32 v1, 2 +; HSA-NEXT: v_mov_b32_e32 v2, 3 +; HSA-NEXT: v_mov_b32_e32 v3, 4 +; HSA-NEXT: v_mov_b32_e32 v4, 5 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v5i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v5i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v5i32(<5 x i32> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v8i32: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { +; VI-LABEL: test_call_external_void_func_v8i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v8i32: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v8i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v8i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v8i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32@rel32@hi+12 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v8i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v8i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v8i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) undef %val = load <8 x i32>, ptr addrspace(1) %ptr call void @external_void_func_v8i32(<8 x i32> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v8i32_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1 -; GCN-DAG: v_mov_b32_e32 v1, 2 -; GCN-DAG: v_mov_b32_e32 v2, 3 -; GCN-DAG: v_mov_b32_e32 v3, 4 -; GCN-DAG: v_mov_b32_e32 v4, 5 -; GCN-DAG: v_mov_b32_e32 v5, 6 -; GCN-DAG: v_mov_b32_e32 v6, 7 -; GCN-DAG: v_mov_b32_e32 v7, 8 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { +; VI-LABEL: test_call_external_void_func_v8i32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, 2 +; VI-NEXT: v_mov_b32_e32 v2, 3 +; VI-NEXT: v_mov_b32_e32 v3, 4 +; VI-NEXT: v_mov_b32_e32 v4, 5 +; VI-NEXT: v_mov_b32_e32 v5, 6 +; VI-NEXT: v_mov_b32_e32 v6, 7 +; VI-NEXT: v_mov_b32_e32 v7, 8 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v8i32_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 1 +; CI-NEXT: v_mov_b32_e32 v1, 2 +; CI-NEXT: v_mov_b32_e32 v2, 3 +; CI-NEXT: v_mov_b32_e32 v3, 4 +; CI-NEXT: v_mov_b32_e32 v4, 5 +; CI-NEXT: v_mov_b32_e32 v5, 6 +; CI-NEXT: v_mov_b32_e32 v6, 7 +; CI-NEXT: v_mov_b32_e32 v7, 8 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v8i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_mov_b32_e32 v2, 3 +; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: v_mov_b32_e32 v4, 5 +; GFX9-NEXT: v_mov_b32_e32 v5, 6 +; GFX9-NEXT: v_mov_b32_e32 v6, 7 +; GFX9-NEXT: v_mov_b32_e32 v7, 8 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v8i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6 +; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v8i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v8i32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1 +; HSA-NEXT: v_mov_b32_e32 v1, 2 +; HSA-NEXT: v_mov_b32_e32 v2, 3 +; HSA-NEXT: v_mov_b32_e32 v3, 4 +; HSA-NEXT: v_mov_b32_e32 v4, 5 +; HSA-NEXT: v_mov_b32_e32 v5, 6 +; HSA-NEXT: v_mov_b32_e32 v6, 7 +; HSA-NEXT: v_mov_b32_e32 v7, 8 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v8i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v8i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v8i32(<8 x i32> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v16i32: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { +; VI-LABEL: test_call_external_void_func_v16i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v16i32: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v16i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v16i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v16i32@rel32@hi+12 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v16i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v16i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v16i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) undef %val = load <16 x i32>, ptr addrspace(1) %ptr call void @external_void_func_v16i32(<16 x i32> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v32i32: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN-DAG: buffer_load_dwordx4 v[16:19], off -; GCN-DAG: buffer_load_dwordx4 v[20:23], off -; GCN-DAG: buffer_load_dwordx4 v[24:27], off -; GCN-DAG: buffer_load_dwordx4 v[28:31], off -; GCN: buffer_store_dword v31, off, s{{\[[0-9]+:[0-9]+\]}}, s32 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { +; VI-LABEL: test_call_external_void_func_v32i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_getpc_b64 s[8:9] +; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; VI-NEXT: s_swappc_b64 s[30:31], s[8:9] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v32i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_getpc_b64 s[8:9] +; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; CI-NEXT: s_swappc_b64 s[30:31], s[8:9] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v32i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v32i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v32i32@rel32@hi+12 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48 +; GFX11-NEXT: buffer_load_b128 v[16:19], off, s[4:7], 0 offset:64 +; GFX11-NEXT: buffer_load_b128 v[20:23], off, s[4:7], 0 offset:80 +; GFX11-NEXT: buffer_load_b128 v[24:27], off, s[4:7], 0 offset:96 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: scratch_store_b32 off, v31, s32 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v32i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 +; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 +; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_getpc_b64 s[12:13] +; HSA-NEXT: s_add_u32 s12, s12, external_void_func_v32i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s13, s13, external_void_func_v32i32@rel32@hi+12 +; HSA-NEXT: s_waitcnt vmcnt(7) +; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; HSA-NEXT: s_swappc_b64 s[30:31], s[12:13] +; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) undef %val = load <32 x i32>, ptr addrspace(1) %ptr call void @external_void_func_v32i32(<32 x i32> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v32i32_i32: -; HSA-NOT: s_add_u32 s32 - -; MESA-NOT: s_add_u32 s32 - -; GCN-DAG: buffer_load_dword [[VAL1:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN-DAG: buffer_load_dwordx4 v[16:19], off -; GCN-DAG: buffer_load_dwordx4 v[20:23], off -; GCN-DAG: buffer_load_dwordx4 v[24:27], off -; GCN-DAG: buffer_load_dwordx4 v[28:31], off - -; GCN: s_waitcnt -; GCN-DAG: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}} -; GCN-DAG: buffer_store_dword v31, off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}} -; GCN: s_swappc_b64 -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { +; VI-LABEL: test_call_external_void_func_v32i32_i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[4:7], 0 +; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v32i32_i32: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dword v32, off, s[4:7], 0 +; CI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; CI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; CI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; CI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v32i32_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v32i32_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v32i32_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v32i32_i32@rel32@hi+12 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112 +; GFX11-NEXT: buffer_load_b32 v32, off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48 +; GFX11-NEXT: buffer_load_b128 v[16:19], off, s[4:7], 0 offset:64 +; GFX11-NEXT: buffer_load_b128 v[20:23], off, s[4:7], 0 offset:80 +; GFX11-NEXT: buffer_load_b128 v[24:27], off, s[4:7], 0 offset:96 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_add_i32 s4, s32, 4 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: scratch_store_b32 off, v31, s32 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: scratch_store_b32 off, v32, s4 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v32i32_i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 +; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 +; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32_i32@rel32@hi+12 +; HSA-NEXT: s_waitcnt vmcnt(8) +; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; HSA-NEXT: s_waitcnt vmcnt(8) +; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef %val0 = load <32 x i32>, ptr addrspace(1) %ptr0 %val1 = load i32, ptr addrspace(1) undef @@ -623,54 +4593,366 @@ ret void } -; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm: -; GCN: v_mov_b32_e32 v0, 42 -; GCN: s_swappc_b64 s[30:31], -; GCN-NOT: s_waitcnt -; GCN: buffer_store_dword v0, off, s[36:39], 0 define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %out) #0 { +; VI-LABEL: test_call_external_i32_func_i32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s42, -1 +; VI-NEXT: s_mov_b32 s43, 0xe80000 +; VI-NEXT: s_add_u32 s40, s40, s5 +; VI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 +; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[40:41] +; VI-NEXT: s_mov_b64 s[2:3], s[42:43] +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_mov_b32 s39, 0xf000 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_i32_func_i32_imm: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s42, -1 +; CI-NEXT: s_mov_b32 s43, 0xe8f000 +; CI-NEXT: s_add_u32 s40, s40, s5 +; CI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; CI-NEXT: s_addc_u32 s41, s41, 0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[40:41] +; CI-NEXT: s_mov_b64 s[2:3], s[42:43] +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_mov_b32 s39, 0xf000 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_i32_func_i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: s_add_u32 s40, s40, s5 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_mov_b32 s39, 0xf000 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_i32_func_i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[36:37], s[2:3], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b32 s39, 0x31016000 +; GFX11-NEXT: s_mov_b32 s38, -1 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_i32_func_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_i32_func_i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: buffer_store_b32 v0, off, s[36:39], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_i32_func_i32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_load_dwordx2 s[36:37], s[6:7], 0x0 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 42 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_mov_b32 s39, 0x1100f000 +; HSA-NEXT: s_mov_b32 s38, -1 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_i32_func_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_i32_func_i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: buffer_store_dword v0, off, s[36:39], 0 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_endpgm %val = call i32 @external_i32_func_i32(i32 42) store volatile i32 %val, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32: -; GCN: buffer_load_ubyte v0, off -; GCN: buffer_load_dword v1, off -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { +; VI-LABEL: test_call_external_void_func_struct_i8_i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_struct_i8_i32: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_struct_i8_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_struct_i8_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[4:7], 0 offset:4 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_struct_i8_i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; HSA-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_struct_i8_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_struct_i8_i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef %val = load { i8, i32 }, ptr addrspace(1) %ptr0 call void @external_void_func_struct_i8_i32({ i8, i32 } %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_byval_struct_i8_i32: -; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3 -; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8 -; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], 0 offset:8 -; MESA-DAG: buffer_store_dword [[VAL1]], off, s[36:39], 0 offset:12 - -; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], 0 offset:8 -; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], 0 offset:12 - -; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], 0 offset:12 -; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], 0 offset:8 - -; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], 0 offset:12 -; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], 0 offset:8 - -; GCN-DAG: s_movk_i32 [[SP:s[0-9]+]], 0x400{{$}} - -; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}} -; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4 - -; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]]{{$}} -; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:4 - -; GCN-NEXT: s_swappc_b64 -; GCN-NOT: [[SP]] define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 { +; VI-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8 +; VI-NEXT: v_mov_b32_e32 v0, 8 +; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12 +; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12 +; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_movk_i32 s32, 0x400 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: v_mov_b32_e32 v0, 3 +; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8 +; CI-NEXT: v_mov_b32_e32 v0, 8 +; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12 +; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12 +; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_movk_i32 s32, 0x400 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12 +; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_movk_i32 s32, 0x400 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 +; GFX11-NEXT: s_mov_b32 s32, 16 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b8 off, v0, off offset:8 +; GFX11-NEXT: scratch_store_b32 off, v1, off offset:12 +; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: v_mov_b32_e32 v0, 3 +; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8 +; HSA-NEXT: v_mov_b32_e32 v0, 8 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 +; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 +; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 +; HSA-NEXT: s_movk_i32 s32, 0x400 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = alloca { i8, i32 }, align 8, addrspace(5) %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 0 %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 1 @@ -680,28 +4962,186 @@ ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: -; GCN-DAG: s_movk_i32 [[SP:s[0-9]+]], 0x800{{$}} - -; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3 -; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8 -; GCN-DAG: buffer_store_byte [[VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; GCN-DAG: buffer_store_dword [[VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 - -; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 - -; GCN-NOT: s_add_u32 [[SP]] -; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]]{{$}} -; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4 -; GCN: s_swappc_b64 -; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 -; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:20 -; GCN-NOT: s_sub_u32 [[SP]] - -; GCN: buffer_store_byte [[LOAD_OUT_VAL0]], off -; GCN: buffer_store_dword [[LOAD_OUT_VAL1]], off define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 { +; VI-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8 +; VI-NEXT: v_mov_b32_e32 v0, 8 +; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12 +; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12 +; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 +; VI-NEXT: s_movk_i32 s32, 0x800 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:16 +; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:20 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s5 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: v_mov_b32_e32 v0, 3 +; CI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8 +; CI-NEXT: v_mov_b32_e32 v0, 8 +; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12 +; CI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12 +; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 +; CI-NEXT: s_movk_i32 s32, 0x800 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; CI-NEXT: v_mov_b32_e32 v0, 16 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:16 +; CI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:20 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12 +; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 16 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:16 +; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:20 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 +; GFX11-NEXT: s_mov_b32 s32, 32 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b8 off, v0, off offset:8 +; GFX11-NEXT: scratch_store_b32 off, v1, off offset:12 +; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32 +; GFX11-NEXT: v_mov_b32_e32 v0, 16 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_u8 v0, off, off offset:16 +; GFX11-NEXT: scratch_load_b32 v1, off, off offset:20 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: v_mov_b32_e32 v0, 3 +; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8 +; HSA-NEXT: v_mov_b32_e32 v0, 8 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 +; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 +; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 +; HSA-NEXT: s_movk_i32 s32, 0x800 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; HSA-NEXT: v_mov_b32_e32 v0, 16 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:16 +; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:20 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_endpgm %in.val = alloca { i8, i32 }, align 8, addrspace(5) %out.val = alloca { i8, i32 }, align 8, addrspace(5) %in.gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %in.val, i32 0, i32 0 @@ -719,74 +5159,1000 @@ ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v16i8: define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { +; VI-LABEL: test_call_external_void_func_v16i8: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mov_b32_e32 v12, v3 +; VI-NEXT: v_mov_b32_e32 v1, v16 +; VI-NEXT: v_mov_b32_e32 v2, v17 +; VI-NEXT: v_mov_b32_e32 v3, v18 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: test_call_external_void_func_v16i8: +; CI: ; %bb.0: +; CI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s38, -1 +; CI-NEXT: s_mov_b32 s39, 0xe8f000 +; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: s_addc_u32 s37, s37, 0 +; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; CI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; CI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; CI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; CI-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; CI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; CI-NEXT: v_mov_b32_e32 v4, v1 +; CI-NEXT: v_mov_b32_e32 v8, v2 +; CI-NEXT: v_mov_b32_e32 v12, v3 +; CI-NEXT: v_mov_b32_e32 v1, v16 +; CI-NEXT: v_mov_b32_e32 v2, v17 +; CI-NEXT: v_mov_b32_e32 v3, v18 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v16i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-NEXT: v_mov_b32_e32 v1, v16 +; GFX9-NEXT: v_mov_b32_e32 v2, v17 +; GFX9-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v16i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v16i8@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v16i8@rel32@hi+12 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v16 +; GFX11-NEXT: v_mov_b32_e32 v8, v2 +; GFX11-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18 +; GFX11-NEXT: v_mov_b32_e32 v2, v17 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v16i8: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v16i8@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v16i8@rel32@hi+12 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; HSA-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; HSA-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; HSA-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; HSA-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; HSA-NEXT: v_mov_b32_e32 v4, v1 +; HSA-NEXT: v_mov_b32_e32 v8, v2 +; HSA-NEXT: v_mov_b32_e32 v12, v3 +; HSA-NEXT: v_mov_b32_e32 v1, v16 +; HSA-NEXT: v_mov_b32_e32 v2, v17 +; HSA-NEXT: v_mov_b32_e32 v3, v18 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) undef %val = load <16 x i8>, ptr addrspace(1) %ptr call void @external_void_func_v16i8(<16 x i8> %val) ret void } -; GCN-LABEL: {{^}}stack_passed_arg_alignment_v32i32_f64: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 -; GCN: s_swappc_b64 define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { +; VI-LABEL: stack_passed_arg_alignment_v32i32_f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s54, -1 +; VI-NEXT: s_mov_b32 s55, 0xe80000 +; VI-NEXT: s_add_u32 s52, s52, s5 +; VI-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa4 +; VI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_addc_u32 s53, s53, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s23 +; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[52:53] +; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 +; VI-NEXT: s_mov_b64 s[2:3], s[54:55] +; VI-NEXT: v_mov_b32_e32 v0, s36 +; VI-NEXT: v_mov_b32_e32 v1, s37 +; VI-NEXT: v_mov_b32_e32 v2, s38 +; VI-NEXT: v_mov_b32_e32 v3, s39 +; VI-NEXT: v_mov_b32_e32 v4, s40 +; VI-NEXT: v_mov_b32_e32 v5, s41 +; VI-NEXT: v_mov_b32_e32 v6, s42 +; VI-NEXT: v_mov_b32_e32 v7, s43 +; VI-NEXT: v_mov_b32_e32 v8, s44 +; VI-NEXT: v_mov_b32_e32 v9, s45 +; VI-NEXT: v_mov_b32_e32 v10, s46 +; VI-NEXT: v_mov_b32_e32 v11, s47 +; VI-NEXT: v_mov_b32_e32 v12, s48 +; VI-NEXT: v_mov_b32_e32 v13, s49 +; VI-NEXT: v_mov_b32_e32 v14, s50 +; VI-NEXT: v_mov_b32_e32 v15, s51 +; VI-NEXT: v_mov_b32_e32 v16, s8 +; VI-NEXT: v_mov_b32_e32 v17, s9 +; VI-NEXT: v_mov_b32_e32 v18, s10 +; VI-NEXT: v_mov_b32_e32 v19, s11 +; VI-NEXT: v_mov_b32_e32 v20, s12 +; VI-NEXT: v_mov_b32_e32 v21, s13 +; VI-NEXT: v_mov_b32_e32 v22, s14 +; VI-NEXT: v_mov_b32_e32 v23, s15 +; VI-NEXT: v_mov_b32_e32 v24, s16 +; VI-NEXT: v_mov_b32_e32 v25, s17 +; VI-NEXT: v_mov_b32_e32 v26, s18 +; VI-NEXT: v_mov_b32_e32 v27, s19 +; VI-NEXT: v_mov_b32_e32 v28, s20 +; VI-NEXT: v_mov_b32_e32 v29, s21 +; VI-NEXT: v_mov_b32_e32 v30, s22 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; CI-LABEL: stack_passed_arg_alignment_v32i32_f64: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; CI-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; CI-NEXT: s_mov_b32 s54, -1 +; CI-NEXT: s_mov_b32 s55, 0xe8f000 +; CI-NEXT: s_add_u32 s52, s52, s5 +; CI-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x19 +; CI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x29 +; CI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x9 +; CI-NEXT: s_mov_b32 s32, 0 +; CI-NEXT: s_addc_u32 s53, s53, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s23 +; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; CI-NEXT: v_mov_b32_e32 v0, s5 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], s[52:53] +; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 +; CI-NEXT: s_mov_b64 s[2:3], s[54:55] +; CI-NEXT: v_mov_b32_e32 v0, s36 +; CI-NEXT: v_mov_b32_e32 v1, s37 +; CI-NEXT: v_mov_b32_e32 v2, s38 +; CI-NEXT: v_mov_b32_e32 v3, s39 +; CI-NEXT: v_mov_b32_e32 v4, s40 +; CI-NEXT: v_mov_b32_e32 v5, s41 +; CI-NEXT: v_mov_b32_e32 v6, s42 +; CI-NEXT: v_mov_b32_e32 v7, s43 +; CI-NEXT: v_mov_b32_e32 v8, s44 +; CI-NEXT: v_mov_b32_e32 v9, s45 +; CI-NEXT: v_mov_b32_e32 v10, s46 +; CI-NEXT: v_mov_b32_e32 v11, s47 +; CI-NEXT: v_mov_b32_e32 v12, s48 +; CI-NEXT: v_mov_b32_e32 v13, s49 +; CI-NEXT: v_mov_b32_e32 v14, s50 +; CI-NEXT: v_mov_b32_e32 v15, s51 +; CI-NEXT: v_mov_b32_e32 v16, s8 +; CI-NEXT: v_mov_b32_e32 v17, s9 +; CI-NEXT: v_mov_b32_e32 v18, s10 +; CI-NEXT: v_mov_b32_e32 v19, s11 +; CI-NEXT: v_mov_b32_e32 v20, s12 +; CI-NEXT: v_mov_b32_e32 v21, s13 +; CI-NEXT: v_mov_b32_e32 v22, s14 +; CI-NEXT: v_mov_b32_e32 v23, s15 +; CI-NEXT: v_mov_b32_e32 v24, s16 +; CI-NEXT: v_mov_b32_e32 v25, s17 +; CI-NEXT: v_mov_b32_e32 v26, s18 +; CI-NEXT: v_mov_b32_e32 v27, s19 +; CI-NEXT: v_mov_b32_e32 v28, s20 +; CI-NEXT: v_mov_b32_e32 v29, s21 +; CI-NEXT: v_mov_b32_e32 v30, s22 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: s_endpgm +; +; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s54, -1 +; GFX9-NEXT: s_mov_b32 s55, 0xe00000 +; GFX9-NEXT: s_add_u32 s52, s52, s5 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa4 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s23 +; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-NEXT: v_mov_b32_e32 v0, s36 +; GFX9-NEXT: v_mov_b32_e32 v1, s37 +; GFX9-NEXT: v_mov_b32_e32 v2, s38 +; GFX9-NEXT: v_mov_b32_e32 v3, s39 +; GFX9-NEXT: v_mov_b32_e32 v4, s40 +; GFX9-NEXT: v_mov_b32_e32 v5, s41 +; GFX9-NEXT: v_mov_b32_e32 v6, s42 +; GFX9-NEXT: v_mov_b32_e32 v7, s43 +; GFX9-NEXT: v_mov_b32_e32 v8, s44 +; GFX9-NEXT: v_mov_b32_e32 v9, s45 +; GFX9-NEXT: v_mov_b32_e32 v10, s46 +; GFX9-NEXT: v_mov_b32_e32 v11, s47 +; GFX9-NEXT: v_mov_b32_e32 v12, s48 +; GFX9-NEXT: v_mov_b32_e32 v13, s49 +; GFX9-NEXT: v_mov_b32_e32 v14, s50 +; GFX9-NEXT: v_mov_b32_e32 v15, s51 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v25, s17 +; GFX9-NEXT: v_mov_b32_e32 v26, s18 +; GFX9-NEXT: v_mov_b32_e32 v27, s19 +; GFX9-NEXT: v_mov_b32_e32 v28, s20 +; GFX9-NEXT: v_mov_b32_e32 v29, s21 +; GFX9-NEXT: v_mov_b32_e32 v30, s22 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: stack_passed_arg_alignment_v32i32_f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[20:21], s[2:3], 0xa4 +; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 +; GFX11-NEXT: s_load_b512 s[36:51], s[2:3], 0x24 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s22, s32, 8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s21 :: v_dual_mov_b32 v1, s20 +; GFX11-NEXT: v_mov_b32_e32 v2, s19 +; GFX11-NEXT: s_add_i32 s19, s32, 4 +; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s43 +; GFX11-NEXT: scratch_store_b32 off, v0, s22 +; GFX11-NEXT: scratch_store_b32 off, v1, s19 +; GFX11-NEXT: scratch_store_b32 off, v2, s32 +; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s39 +; GFX11-NEXT: v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v2, s38 +; GFX11-NEXT: v_dual_mov_b32 v5, s41 :: v_dual_mov_b32 v6, s42 +; GFX11-NEXT: v_dual_mov_b32 v9, s45 :: v_dual_mov_b32 v8, s44 +; GFX11-NEXT: v_dual_mov_b32 v11, s47 :: v_dual_mov_b32 v10, s46 +; GFX11-NEXT: v_dual_mov_b32 v13, s49 :: v_dual_mov_b32 v12, s48 +; GFX11-NEXT: v_dual_mov_b32 v15, s51 :: v_dual_mov_b32 v14, s50 +; GFX11-NEXT: v_dual_mov_b32 v17, s5 :: v_dual_mov_b32 v16, s4 +; GFX11-NEXT: v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v18, s6 +; GFX11-NEXT: v_dual_mov_b32 v21, s9 :: v_dual_mov_b32 v20, s8 +; GFX11-NEXT: v_dual_mov_b32 v23, s11 :: v_dual_mov_b32 v22, s10 +; GFX11-NEXT: v_dual_mov_b32 v25, s13 :: v_dual_mov_b32 v24, s12 +; GFX11-NEXT: v_dual_mov_b32 v27, s15 :: v_dual_mov_b32 v26, s14 +; GFX11-NEXT: v_dual_mov_b32 v29, s17 :: v_dual_mov_b32 v28, s16 +; GFX11-NEXT: v_mov_b32_e32 v30, s18 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, stack_passed_f64_arg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, stack_passed_f64_arg@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: stack_passed_arg_alignment_v32i32_f64: +; HSA: ; %bb.0: ; %entry +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 +; HSA-NEXT: s_load_dwordx2 s[24:25], s[6:7], 0x80 +; HSA-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: v_mov_b32_e32 v0, s23 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; HSA-NEXT: v_mov_b32_e32 v0, s24 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: v_mov_b32_e32 v0, s25 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, s36 +; HSA-NEXT: v_mov_b32_e32 v1, s37 +; HSA-NEXT: v_mov_b32_e32 v2, s38 +; HSA-NEXT: v_mov_b32_e32 v3, s39 +; HSA-NEXT: v_mov_b32_e32 v4, s40 +; HSA-NEXT: v_mov_b32_e32 v5, s41 +; HSA-NEXT: v_mov_b32_e32 v6, s42 +; HSA-NEXT: v_mov_b32_e32 v7, s43 +; HSA-NEXT: v_mov_b32_e32 v8, s44 +; HSA-NEXT: v_mov_b32_e32 v9, s45 +; HSA-NEXT: v_mov_b32_e32 v10, s46 +; HSA-NEXT: v_mov_b32_e32 v11, s47 +; HSA-NEXT: v_mov_b32_e32 v12, s48 +; HSA-NEXT: v_mov_b32_e32 v13, s49 +; HSA-NEXT: v_mov_b32_e32 v14, s50 +; HSA-NEXT: v_mov_b32_e32 v15, s51 +; HSA-NEXT: v_mov_b32_e32 v16, s8 +; HSA-NEXT: v_mov_b32_e32 v17, s9 +; HSA-NEXT: v_mov_b32_e32 v18, s10 +; HSA-NEXT: v_mov_b32_e32 v19, s11 +; HSA-NEXT: v_mov_b32_e32 v20, s12 +; HSA-NEXT: v_mov_b32_e32 v21, s13 +; HSA-NEXT: v_mov_b32_e32 v22, s14 +; HSA-NEXT: v_mov_b32_e32 v23, s15 +; HSA-NEXT: v_mov_b32_e32 v24, s16 +; HSA-NEXT: v_mov_b32_e32 v25, s17 +; HSA-NEXT: v_mov_b32_e32 v26, s18 +; HSA-NEXT: v_mov_b32_e32 v27, s19 +; HSA-NEXT: v_mov_b32_e32 v28, s20 +; HSA-NEXT: v_mov_b32_e32 v29, s21 +; HSA-NEXT: v_mov_b32_e32 v30, s22 +; HSA-NEXT: s_getpc_b64 s[24:25] +; HSA-NEXT: s_add_u32 s24, s24, stack_passed_f64_arg@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s25, s25, stack_passed_f64_arg@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[24:25] +; HSA-NEXT: s_endpgm entry: call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp) ret void } -; GCN-LABEL: {{^}}tail_call_byval_align16: -; GCN-NOT: s32 -; GCN: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:28 -; GCN: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32{{$}} - -; GCN: s_getpc_b64 - -; GCN: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:20 -; GCN: buffer_load_dword [[VREG3:v[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN: buffer_store_dword [[VREG1]], off, s[0:3], s32{{$}} -; GCN: buffer_store_dword [[VREG3]], off, s[0:3], s32 offset:16{{$}} -; GCN-NOT: s32 -; GCN: s_setpc_b64 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { +; VI-LABEL: tail_call_byval_align16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; VI-NEXT: s_setpc_b64 s[4:5] +; +; CI-LABEL: tail_call_byval_align16: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; CI-NEXT: s_setpc_b64 s[4:5] +; +; GFX9-LABEL: tail_call_byval_align16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX11-LABEL: tail_call_byval_align16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b32 off, v31, s32 +; GFX11-NEXT: scratch_load_b64 v[31:32], off, s32 offset:24 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b64 off, v[31:32], s32 offset:16 +; GFX11-NEXT: s_setpc_b64 s[0:1] +; +; HSA-LABEL: tail_call_byval_align16: +; HSA: ; %bb.0: ; %entry +; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; HSA-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 +; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; HSA-NEXT: s_waitcnt vmcnt(2) +; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; HSA-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca double, align 8, addrspace(5) tail call void @byval_align16_f64_arg(<32 x i32> %val, ptr addrspace(5) byval(double) align 16 %alloca) ret void } -; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64: -; GCN-NOT: s32 -; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_load_dword v31, off, s[0:3], s32{{$}} -; GCN: s_getpc_b64 -; GCN: buffer_store_dword v31, off, s[0:3], s32{{$}} -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; GCN-NOT: s32 -; GCN: s_setpc_b64 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { +; VI-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: s_setpc_b64 s[4:5] +; +; CI-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; CI-NEXT: s_setpc_b64 s[4:5] +; +; GFX9-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX11-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 +; GFX11-NEXT: scratch_load_b64 v[31:32], off, s32 offset:4 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: scratch_store_b32 off, v33, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b64 off, v[31:32], s32 offset:4 +; GFX11-NEXT: s_setpc_b64 s[0:1] +; +; HSA-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64: +; HSA: ; %bb.0: ; %entry +; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; HSA-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; HSA-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; HSA-NEXT: s_waitcnt vmcnt(2) +; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; HSA-NEXT: s_waitcnt vmcnt(2) +; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; HSA-NEXT: s_waitcnt vmcnt(2) +; HSA-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; HSA-NEXT: s_setpc_b64 s[4:5] entry: tail call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp) ret void } -; GCN-LABEL: {{^}}stack_12xv3i32: -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN: buffer_store_dword [[REG11]], off, s[0:3], s32{{$}} -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: buffer_store_dword [[REG12]], {{.*$}} -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16 -; GCN: s_getpc define void @stack_12xv3i32() #0 { +; VI-LABEL: stack_12xv3i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s33 +; VI-NEXT: s_mov_b32 s33, s32 +; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[8:9] +; VI-NEXT: s_addk_i32 s32, 0x400 +; VI-NEXT: v_mov_b32_e32 v0, 11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v0, 12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v0, 13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; VI-NEXT: v_mov_b32_e32 v0, 14 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; VI-NEXT: v_mov_b32_e32 v0, 15 +; VI-NEXT: v_writelane_b32 v40, s30, 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: v_mov_b32_e32 v3, 1 +; VI-NEXT: v_mov_b32_e32 v4, 1 +; VI-NEXT: v_mov_b32_e32 v5, 1 +; VI-NEXT: v_mov_b32_e32 v6, 2 +; VI-NEXT: v_mov_b32_e32 v7, 2 +; VI-NEXT: v_mov_b32_e32 v8, 2 +; VI-NEXT: v_mov_b32_e32 v9, 3 +; VI-NEXT: v_mov_b32_e32 v10, 3 +; VI-NEXT: v_mov_b32_e32 v11, 3 +; VI-NEXT: v_mov_b32_e32 v12, 4 +; VI-NEXT: v_mov_b32_e32 v13, 4 +; VI-NEXT: v_mov_b32_e32 v14, 4 +; VI-NEXT: v_mov_b32_e32 v15, 5 +; VI-NEXT: v_mov_b32_e32 v16, 5 +; VI-NEXT: v_mov_b32_e32 v17, 5 +; VI-NEXT: v_mov_b32_e32 v18, 6 +; VI-NEXT: v_mov_b32_e32 v19, 6 +; VI-NEXT: v_mov_b32_e32 v20, 6 +; VI-NEXT: v_mov_b32_e32 v21, 7 +; VI-NEXT: v_mov_b32_e32 v22, 7 +; VI-NEXT: v_mov_b32_e32 v23, 7 +; VI-NEXT: v_mov_b32_e32 v24, 8 +; VI-NEXT: v_mov_b32_e32 v25, 8 +; VI-NEXT: v_mov_b32_e32 v26, 8 +; VI-NEXT: v_mov_b32_e32 v27, 9 +; VI-NEXT: v_mov_b32_e32 v28, 9 +; VI-NEXT: v_mov_b32_e32 v29, 9 +; VI-NEXT: v_mov_b32_e32 v30, 10 +; VI-NEXT: v_writelane_b32 v41, s4, 0 +; VI-NEXT: v_writelane_b32 v40, s31, 1 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: v_readlane_b32 s31, v40, 1 +; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: v_readlane_b32 s4, v41, 0 +; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[6:7] +; VI-NEXT: s_addk_i32 s32, 0xfc00 +; VI-NEXT: s_mov_b32 s33, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: stack_12xv3i32: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s4, s33 +; CI-NEXT: s_mov_b32 s33, s32 +; CI-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CI-NEXT: s_mov_b64 exec, s[8:9] +; CI-NEXT: s_addk_i32 s32, 0x400 +; CI-NEXT: v_mov_b32_e32 v0, 11 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; CI-NEXT: v_mov_b32_e32 v0, 12 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; CI-NEXT: v_mov_b32_e32 v0, 13 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; CI-NEXT: v_mov_b32_e32 v0, 14 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; CI-NEXT: v_mov_b32_e32 v0, 15 +; CI-NEXT: v_writelane_b32 v40, s30, 0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_mov_b32_e32 v3, 1 +; CI-NEXT: v_mov_b32_e32 v4, 1 +; CI-NEXT: v_mov_b32_e32 v5, 1 +; CI-NEXT: v_mov_b32_e32 v6, 2 +; CI-NEXT: v_mov_b32_e32 v7, 2 +; CI-NEXT: v_mov_b32_e32 v8, 2 +; CI-NEXT: v_mov_b32_e32 v9, 3 +; CI-NEXT: v_mov_b32_e32 v10, 3 +; CI-NEXT: v_mov_b32_e32 v11, 3 +; CI-NEXT: v_mov_b32_e32 v12, 4 +; CI-NEXT: v_mov_b32_e32 v13, 4 +; CI-NEXT: v_mov_b32_e32 v14, 4 +; CI-NEXT: v_mov_b32_e32 v15, 5 +; CI-NEXT: v_mov_b32_e32 v16, 5 +; CI-NEXT: v_mov_b32_e32 v17, 5 +; CI-NEXT: v_mov_b32_e32 v18, 6 +; CI-NEXT: v_mov_b32_e32 v19, 6 +; CI-NEXT: v_mov_b32_e32 v20, 6 +; CI-NEXT: v_mov_b32_e32 v21, 7 +; CI-NEXT: v_mov_b32_e32 v22, 7 +; CI-NEXT: v_mov_b32_e32 v23, 7 +; CI-NEXT: v_mov_b32_e32 v24, 8 +; CI-NEXT: v_mov_b32_e32 v25, 8 +; CI-NEXT: v_mov_b32_e32 v26, 8 +; CI-NEXT: v_mov_b32_e32 v27, 9 +; CI-NEXT: v_mov_b32_e32 v28, 9 +; CI-NEXT: v_mov_b32_e32 v29, 9 +; CI-NEXT: v_mov_b32_e32 v30, 10 +; CI-NEXT: v_writelane_b32 v41, s4, 0 +; CI-NEXT: v_writelane_b32 v40, s31, 1 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: v_readlane_b32 s31, v40, 1 +; CI-NEXT: v_readlane_b32 s30, v40, 0 +; CI-NEXT: v_readlane_b32 s4, v41, 0 +; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CI-NEXT: s_mov_b64 exec, s[6:7] +; CI-NEXT: s_addk_i32 s32, 0xfc00 +; CI-NEXT: s_mov_b32 s33, s4 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: stack_12xv3i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 1 +; GFX9-NEXT: v_mov_b32_e32 v4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, 1 +; GFX9-NEXT: v_mov_b32_e32 v6, 2 +; GFX9-NEXT: v_mov_b32_e32 v7, 2 +; GFX9-NEXT: v_mov_b32_e32 v8, 2 +; GFX9-NEXT: v_mov_b32_e32 v9, 3 +; GFX9-NEXT: v_mov_b32_e32 v10, 3 +; GFX9-NEXT: v_mov_b32_e32 v11, 3 +; GFX9-NEXT: v_mov_b32_e32 v12, 4 +; GFX9-NEXT: v_mov_b32_e32 v13, 4 +; GFX9-NEXT: v_mov_b32_e32 v14, 4 +; GFX9-NEXT: v_mov_b32_e32 v15, 5 +; GFX9-NEXT: v_mov_b32_e32 v16, 5 +; GFX9-NEXT: v_mov_b32_e32 v17, 5 +; GFX9-NEXT: v_mov_b32_e32 v18, 6 +; GFX9-NEXT: v_mov_b32_e32 v19, 6 +; GFX9-NEXT: v_mov_b32_e32 v20, 6 +; GFX9-NEXT: v_mov_b32_e32 v21, 7 +; GFX9-NEXT: v_mov_b32_e32 v22, 7 +; GFX9-NEXT: v_mov_b32_e32 v23, 7 +; GFX9-NEXT: v_mov_b32_e32 v24, 8 +; GFX9-NEXT: v_mov_b32_e32 v25, 8 +; GFX9-NEXT: v_mov_b32_e32 v26, 8 +; GFX9-NEXT: v_mov_b32_e32 v27, 9 +; GFX9-NEXT: v_mov_b32_e32 v28, 9 +; GFX9-NEXT: v_mov_b32_e32 v29, 9 +; GFX9-NEXT: v_mov_b32_e32 v30, 10 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_12xv3i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, 11 :: v_dual_mov_b32 v1, 12 +; GFX11-NEXT: v_dual_mov_b32 v2, 13 :: v_dual_mov_b32 v3, 14 +; GFX11-NEXT: v_mov_b32_e32 v4, 15 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: s_add_i32 s0, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: scratch_store_b32 off, v4, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: v_dual_mov_b32 v3, 1 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, 1 :: v_dual_mov_b32 v4, 1 +; GFX11-NEXT: v_dual_mov_b32 v7, 2 :: v_dual_mov_b32 v6, 2 +; GFX11-NEXT: v_dual_mov_b32 v9, 3 :: v_dual_mov_b32 v8, 2 +; GFX11-NEXT: v_dual_mov_b32 v11, 3 :: v_dual_mov_b32 v10, 3 +; GFX11-NEXT: v_dual_mov_b32 v13, 4 :: v_dual_mov_b32 v12, 4 +; GFX11-NEXT: v_dual_mov_b32 v15, 5 :: v_dual_mov_b32 v14, 4 +; GFX11-NEXT: v_dual_mov_b32 v17, 5 :: v_dual_mov_b32 v16, 5 +; GFX11-NEXT: v_dual_mov_b32 v19, 6 :: v_dual_mov_b32 v18, 6 +; GFX11-NEXT: v_dual_mov_b32 v21, 7 :: v_dual_mov_b32 v20, 6 +; GFX11-NEXT: v_dual_mov_b32 v23, 7 :: v_dual_mov_b32 v22, 7 +; GFX11-NEXT: v_dual_mov_b32 v25, 8 :: v_dual_mov_b32 v24, 8 +; GFX11-NEXT: v_dual_mov_b32 v27, 9 :: v_dual_mov_b32 v26, 8 +; GFX11-NEXT: v_dual_mov_b32 v29, 9 :: v_dual_mov_b32 v28, 9 +; GFX11-NEXT: v_mov_b32_e32 v30, 10 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; HSA-LABEL: stack_12xv3i32: +; HSA: ; %bb.0: ; %entry +; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HSA-NEXT: s_mov_b32 s4, s33 +; HSA-NEXT: s_mov_b32 s33, s32 +; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 +; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; HSA-NEXT: s_mov_b64 exec, s[8:9] +; HSA-NEXT: s_addk_i32 s32, 0x400 +; HSA-NEXT: v_mov_b32_e32 v0, 11 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; HSA-NEXT: v_mov_b32_e32 v0, 12 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: v_mov_b32_e32 v0, 13 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; HSA-NEXT: v_mov_b32_e32 v0, 14 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; HSA-NEXT: v_mov_b32_e32 v0, 15 +; HSA-NEXT: v_writelane_b32 v40, s30, 0 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; HSA-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NEXT: v_mov_b32_e32 v1, 0 +; HSA-NEXT: v_mov_b32_e32 v2, 0 +; HSA-NEXT: v_mov_b32_e32 v3, 1 +; HSA-NEXT: v_mov_b32_e32 v4, 1 +; HSA-NEXT: v_mov_b32_e32 v5, 1 +; HSA-NEXT: v_mov_b32_e32 v6, 2 +; HSA-NEXT: v_mov_b32_e32 v7, 2 +; HSA-NEXT: v_mov_b32_e32 v8, 2 +; HSA-NEXT: v_mov_b32_e32 v9, 3 +; HSA-NEXT: v_mov_b32_e32 v10, 3 +; HSA-NEXT: v_mov_b32_e32 v11, 3 +; HSA-NEXT: v_mov_b32_e32 v12, 4 +; HSA-NEXT: v_mov_b32_e32 v13, 4 +; HSA-NEXT: v_mov_b32_e32 v14, 4 +; HSA-NEXT: v_mov_b32_e32 v15, 5 +; HSA-NEXT: v_mov_b32_e32 v16, 5 +; HSA-NEXT: v_mov_b32_e32 v17, 5 +; HSA-NEXT: v_mov_b32_e32 v18, 6 +; HSA-NEXT: v_mov_b32_e32 v19, 6 +; HSA-NEXT: v_mov_b32_e32 v20, 6 +; HSA-NEXT: v_mov_b32_e32 v21, 7 +; HSA-NEXT: v_mov_b32_e32 v22, 7 +; HSA-NEXT: v_mov_b32_e32 v23, 7 +; HSA-NEXT: v_mov_b32_e32 v24, 8 +; HSA-NEXT: v_mov_b32_e32 v25, 8 +; HSA-NEXT: v_mov_b32_e32 v26, 8 +; HSA-NEXT: v_mov_b32_e32 v27, 9 +; HSA-NEXT: v_mov_b32_e32 v28, 9 +; HSA-NEXT: v_mov_b32_e32 v29, 9 +; HSA-NEXT: v_mov_b32_e32 v30, 10 +; HSA-NEXT: v_writelane_b32 v41, s4, 0 +; HSA-NEXT: v_writelane_b32 v40, s31, 1 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: v_readlane_b32 s31, v40, 1 +; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: v_readlane_b32 s4, v41, 0 +; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; HSA-NEXT: s_mov_b64 exec, s[6:7] +; HSA-NEXT: s_addk_i32 s32, 0xfc00 +; HSA-NEXT: s_mov_b32 s33, s4 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_setpc_b64 s[30:31] entry: call void @external_void_func_12xv3i32( <3 x i32>, @@ -804,19 +6170,345 @@ ret void } -; GCN-LABEL: {{^}}stack_12xv3f32: -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 -; GCN: buffer_store_dword [[REG11]], {{.*$}} -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16 -; GCN: s_getpc define void @stack_12xv3f32() #0 { +; VI-LABEL: stack_12xv3f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s33 +; VI-NEXT: s_mov_b32 s33, s32 +; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[8:9] +; VI-NEXT: s_addk_i32 s32, 0x400 +; VI-NEXT: v_mov_b32_e32 v0, 0x41300000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v0, 0x41400000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v0, 0x41500000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; VI-NEXT: v_mov_b32_e32 v0, 0x41600000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; VI-NEXT: v_mov_b32_e32 v0, 0x41700000 +; VI-NEXT: v_writelane_b32 v40, s30, 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: v_mov_b32_e32 v3, 1.0 +; VI-NEXT: v_mov_b32_e32 v4, 1.0 +; VI-NEXT: v_mov_b32_e32 v5, 1.0 +; VI-NEXT: v_mov_b32_e32 v6, 2.0 +; VI-NEXT: v_mov_b32_e32 v7, 2.0 +; VI-NEXT: v_mov_b32_e32 v8, 2.0 +; VI-NEXT: v_mov_b32_e32 v9, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v10, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v11, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v12, 4.0 +; VI-NEXT: v_mov_b32_e32 v13, 4.0 +; VI-NEXT: v_mov_b32_e32 v14, 4.0 +; VI-NEXT: v_mov_b32_e32 v15, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v17, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v18, 0x40c00000 +; VI-NEXT: v_mov_b32_e32 v19, 0x40c00000 +; VI-NEXT: v_mov_b32_e32 v20, 0x40c00000 +; VI-NEXT: v_mov_b32_e32 v21, 0x40e00000 +; VI-NEXT: v_mov_b32_e32 v22, 0x40e00000 +; VI-NEXT: v_mov_b32_e32 v23, 0x40e00000 +; VI-NEXT: v_mov_b32_e32 v24, 0x41000000 +; VI-NEXT: v_mov_b32_e32 v25, 0x41000000 +; VI-NEXT: v_mov_b32_e32 v26, 0x41000000 +; VI-NEXT: v_mov_b32_e32 v27, 0x41100000 +; VI-NEXT: v_mov_b32_e32 v28, 0x41100000 +; VI-NEXT: v_mov_b32_e32 v29, 0x41100000 +; VI-NEXT: v_mov_b32_e32 v30, 0x41200000 +; VI-NEXT: v_writelane_b32 v41, s4, 0 +; VI-NEXT: v_writelane_b32 v40, s31, 1 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: v_readlane_b32 s31, v40, 1 +; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: v_readlane_b32 s4, v41, 0 +; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[6:7] +; VI-NEXT: s_addk_i32 s32, 0xfc00 +; VI-NEXT: s_mov_b32 s33, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: stack_12xv3f32: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s4, s33 +; CI-NEXT: s_mov_b32 s33, s32 +; CI-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CI-NEXT: s_mov_b64 exec, s[8:9] +; CI-NEXT: s_addk_i32 s32, 0x400 +; CI-NEXT: v_mov_b32_e32 v0, 0x41300000 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; CI-NEXT: v_mov_b32_e32 v0, 0x41400000 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; CI-NEXT: v_mov_b32_e32 v0, 0x41500000 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; CI-NEXT: v_mov_b32_e32 v0, 0x41600000 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; CI-NEXT: v_mov_b32_e32 v0, 0x41700000 +; CI-NEXT: v_writelane_b32 v40, s30, 0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_mov_b32_e32 v3, 1.0 +; CI-NEXT: v_mov_b32_e32 v4, 1.0 +; CI-NEXT: v_mov_b32_e32 v5, 1.0 +; CI-NEXT: v_mov_b32_e32 v6, 2.0 +; CI-NEXT: v_mov_b32_e32 v7, 2.0 +; CI-NEXT: v_mov_b32_e32 v8, 2.0 +; CI-NEXT: v_mov_b32_e32 v9, 0x40400000 +; CI-NEXT: v_mov_b32_e32 v10, 0x40400000 +; CI-NEXT: v_mov_b32_e32 v11, 0x40400000 +; CI-NEXT: v_mov_b32_e32 v12, 4.0 +; CI-NEXT: v_mov_b32_e32 v13, 4.0 +; CI-NEXT: v_mov_b32_e32 v14, 4.0 +; CI-NEXT: v_mov_b32_e32 v15, 0x40a00000 +; CI-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; CI-NEXT: v_mov_b32_e32 v17, 0x40a00000 +; CI-NEXT: v_mov_b32_e32 v18, 0x40c00000 +; CI-NEXT: v_mov_b32_e32 v19, 0x40c00000 +; CI-NEXT: v_mov_b32_e32 v20, 0x40c00000 +; CI-NEXT: v_mov_b32_e32 v21, 0x40e00000 +; CI-NEXT: v_mov_b32_e32 v22, 0x40e00000 +; CI-NEXT: v_mov_b32_e32 v23, 0x40e00000 +; CI-NEXT: v_mov_b32_e32 v24, 0x41000000 +; CI-NEXT: v_mov_b32_e32 v25, 0x41000000 +; CI-NEXT: v_mov_b32_e32 v26, 0x41000000 +; CI-NEXT: v_mov_b32_e32 v27, 0x41100000 +; CI-NEXT: v_mov_b32_e32 v28, 0x41100000 +; CI-NEXT: v_mov_b32_e32 v29, 0x41100000 +; CI-NEXT: v_mov_b32_e32 v30, 0x41200000 +; CI-NEXT: v_writelane_b32 v41, s4, 0 +; CI-NEXT: v_writelane_b32 v40, s31, 1 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: v_readlane_b32 s31, v40, 1 +; CI-NEXT: v_readlane_b32 s30, v40, 0 +; CI-NEXT: v_readlane_b32 s4, v41, 0 +; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CI-NEXT: s_mov_b64 exec, s[6:7] +; CI-NEXT: s_addk_i32 s32, 0xfc00 +; CI-NEXT: s_mov_b32 s33, s4 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: stack_12xv3f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41300000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41400000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v7, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v8, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v11, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v12, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v13, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v14, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v17, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v18, 0x40c00000 +; GFX9-NEXT: v_mov_b32_e32 v19, 0x40c00000 +; GFX9-NEXT: v_mov_b32_e32 v20, 0x40c00000 +; GFX9-NEXT: v_mov_b32_e32 v21, 0x40e00000 +; GFX9-NEXT: v_mov_b32_e32 v22, 0x40e00000 +; GFX9-NEXT: v_mov_b32_e32 v23, 0x40e00000 +; GFX9-NEXT: v_mov_b32_e32 v24, 0x41000000 +; GFX9-NEXT: v_mov_b32_e32 v25, 0x41000000 +; GFX9-NEXT: v_mov_b32_e32 v26, 0x41000000 +; GFX9-NEXT: v_mov_b32_e32 v27, 0x41100000 +; GFX9-NEXT: v_mov_b32_e32 v28, 0x41100000 +; GFX9-NEXT: v_mov_b32_e32 v29, 0x41100000 +; GFX9-NEXT: v_mov_b32_e32 v30, 0x41200000 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_12xv3f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x41300000 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x41400000 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x41500000 +; GFX11-NEXT: v_mov_b32_e32 v3, 0x41600000 +; GFX11-NEXT: v_dual_mov_b32 v4, 0x41700000 :: v_dual_mov_b32 v5, 1.0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: s_add_i32 s0, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: scratch_store_b32 off, v4, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v6, 2.0 :: v_dual_mov_b32 v9, 0x40400000 +; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v11, 0x40400000 +; GFX11-NEXT: v_dual_mov_b32 v10, 0x40400000 :: v_dual_mov_b32 v13, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v12, 4.0 :: v_dual_mov_b32 v15, 0x40a00000 +; GFX11-NEXT: v_dual_mov_b32 v14, 4.0 :: v_dual_mov_b32 v17, 0x40a00000 +; GFX11-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; GFX11-NEXT: v_dual_mov_b32 v18, 0x40c00000 :: v_dual_mov_b32 v19, 0x40c00000 +; GFX11-NEXT: v_mov_b32_e32 v20, 0x40c00000 +; GFX11-NEXT: v_dual_mov_b32 v21, 0x40e00000 :: v_dual_mov_b32 v22, 0x40e00000 +; GFX11-NEXT: v_mov_b32_e32 v23, 0x40e00000 +; GFX11-NEXT: v_dual_mov_b32 v24, 0x41000000 :: v_dual_mov_b32 v25, 0x41000000 +; GFX11-NEXT: v_mov_b32_e32 v26, 0x41000000 +; GFX11-NEXT: v_dual_mov_b32 v27, 0x41100000 :: v_dual_mov_b32 v28, 0x41100000 +; GFX11-NEXT: v_mov_b32_e32 v29, 0x41100000 +; GFX11-NEXT: v_mov_b32_e32 v30, 0x41200000 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3f32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; HSA-LABEL: stack_12xv3f32: +; HSA: ; %bb.0: ; %entry +; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HSA-NEXT: s_mov_b32 s4, s33 +; HSA-NEXT: s_mov_b32 s33, s32 +; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 +; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; HSA-NEXT: s_mov_b64 exec, s[8:9] +; HSA-NEXT: s_addk_i32 s32, 0x400 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41300000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41400000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41500000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41600000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41700000 +; HSA-NEXT: v_writelane_b32 v40, s30, 0 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; HSA-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NEXT: v_mov_b32_e32 v1, 0 +; HSA-NEXT: v_mov_b32_e32 v2, 0 +; HSA-NEXT: v_mov_b32_e32 v3, 1.0 +; HSA-NEXT: v_mov_b32_e32 v4, 1.0 +; HSA-NEXT: v_mov_b32_e32 v5, 1.0 +; HSA-NEXT: v_mov_b32_e32 v6, 2.0 +; HSA-NEXT: v_mov_b32_e32 v7, 2.0 +; HSA-NEXT: v_mov_b32_e32 v8, 2.0 +; HSA-NEXT: v_mov_b32_e32 v9, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v10, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v11, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v12, 4.0 +; HSA-NEXT: v_mov_b32_e32 v13, 4.0 +; HSA-NEXT: v_mov_b32_e32 v14, 4.0 +; HSA-NEXT: v_mov_b32_e32 v15, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v17, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v18, 0x40c00000 +; HSA-NEXT: v_mov_b32_e32 v19, 0x40c00000 +; HSA-NEXT: v_mov_b32_e32 v20, 0x40c00000 +; HSA-NEXT: v_mov_b32_e32 v21, 0x40e00000 +; HSA-NEXT: v_mov_b32_e32 v22, 0x40e00000 +; HSA-NEXT: v_mov_b32_e32 v23, 0x40e00000 +; HSA-NEXT: v_mov_b32_e32 v24, 0x41000000 +; HSA-NEXT: v_mov_b32_e32 v25, 0x41000000 +; HSA-NEXT: v_mov_b32_e32 v26, 0x41000000 +; HSA-NEXT: v_mov_b32_e32 v27, 0x41100000 +; HSA-NEXT: v_mov_b32_e32 v28, 0x41100000 +; HSA-NEXT: v_mov_b32_e32 v29, 0x41100000 +; HSA-NEXT: v_mov_b32_e32 v30, 0x41200000 +; HSA-NEXT: v_writelane_b32 v41, s4, 0 +; HSA-NEXT: v_writelane_b32 v40, s31, 1 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: v_readlane_b32 s31, v40, 1 +; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: v_readlane_b32 s4, v41, 0 +; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; HSA-NEXT: s_mov_b64 exec, s[6:7] +; HSA-NEXT: s_addk_i32 s32, 0xfc00 +; HSA-NEXT: s_mov_b32 s33, s4 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_setpc_b64 s[30:31] entry: call void @external_void_func_12xv3f32( <3 x float>, @@ -834,27 +6526,378 @@ ret void } -; GCN-LABEL: {{^}}stack_8xv5i32: -; GCN: v_mov_b32_e32 [[REG7:v[0-9]+]], 7 -; GCN: buffer_store_dword [[REG7]], {{.*$}} -; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4 -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32 -; GCN: s_getpc define void @stack_8xv5i32() #0 { +; VI-LABEL: stack_8xv5i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s33 +; VI-NEXT: s_mov_b32 s33, s32 +; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[8:9] +; VI-NEXT: s_addk_i32 s32, 0x400 +; VI-NEXT: v_mov_b32_e32 v0, 7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v0, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v0, 9 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; VI-NEXT: v_mov_b32_e32 v0, 10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; VI-NEXT: v_mov_b32_e32 v0, 11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; VI-NEXT: v_mov_b32_e32 v0, 12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; VI-NEXT: v_mov_b32_e32 v0, 13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; VI-NEXT: v_mov_b32_e32 v0, 14 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; VI-NEXT: v_mov_b32_e32 v0, 15 +; VI-NEXT: v_writelane_b32 v40, s30, 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v4, 0 +; VI-NEXT: v_mov_b32_e32 v5, 1 +; VI-NEXT: v_mov_b32_e32 v6, 1 +; VI-NEXT: v_mov_b32_e32 v7, 1 +; VI-NEXT: v_mov_b32_e32 v8, 1 +; VI-NEXT: v_mov_b32_e32 v9, 1 +; VI-NEXT: v_mov_b32_e32 v10, 2 +; VI-NEXT: v_mov_b32_e32 v11, 2 +; VI-NEXT: v_mov_b32_e32 v12, 2 +; VI-NEXT: v_mov_b32_e32 v13, 2 +; VI-NEXT: v_mov_b32_e32 v14, 2 +; VI-NEXT: v_mov_b32_e32 v15, 3 +; VI-NEXT: v_mov_b32_e32 v16, 3 +; VI-NEXT: v_mov_b32_e32 v17, 3 +; VI-NEXT: v_mov_b32_e32 v18, 3 +; VI-NEXT: v_mov_b32_e32 v19, 3 +; VI-NEXT: v_mov_b32_e32 v20, 4 +; VI-NEXT: v_mov_b32_e32 v21, 4 +; VI-NEXT: v_mov_b32_e32 v22, 4 +; VI-NEXT: v_mov_b32_e32 v23, 4 +; VI-NEXT: v_mov_b32_e32 v24, 4 +; VI-NEXT: v_mov_b32_e32 v25, 5 +; VI-NEXT: v_mov_b32_e32 v26, 5 +; VI-NEXT: v_mov_b32_e32 v27, 5 +; VI-NEXT: v_mov_b32_e32 v28, 5 +; VI-NEXT: v_mov_b32_e32 v29, 5 +; VI-NEXT: v_mov_b32_e32 v30, 6 +; VI-NEXT: v_writelane_b32 v41, s4, 0 +; VI-NEXT: v_writelane_b32 v40, s31, 1 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: v_readlane_b32 s31, v40, 1 +; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: v_readlane_b32 s4, v41, 0 +; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[6:7] +; VI-NEXT: s_addk_i32 s32, 0xfc00 +; VI-NEXT: s_mov_b32 s33, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: stack_8xv5i32: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s4, s33 +; CI-NEXT: s_mov_b32 s33, s32 +; CI-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CI-NEXT: s_mov_b64 exec, s[8:9] +; CI-NEXT: s_addk_i32 s32, 0x400 +; CI-NEXT: v_mov_b32_e32 v0, 7 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; CI-NEXT: v_mov_b32_e32 v0, 8 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; CI-NEXT: v_mov_b32_e32 v0, 9 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; CI-NEXT: v_mov_b32_e32 v0, 10 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; CI-NEXT: v_mov_b32_e32 v0, 11 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; CI-NEXT: v_mov_b32_e32 v0, 12 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; CI-NEXT: v_mov_b32_e32 v0, 13 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; CI-NEXT: v_mov_b32_e32 v0, 14 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; CI-NEXT: v_mov_b32_e32 v0, 15 +; CI-NEXT: v_writelane_b32 v40, s30, 0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v4, 0 +; CI-NEXT: v_mov_b32_e32 v5, 1 +; CI-NEXT: v_mov_b32_e32 v6, 1 +; CI-NEXT: v_mov_b32_e32 v7, 1 +; CI-NEXT: v_mov_b32_e32 v8, 1 +; CI-NEXT: v_mov_b32_e32 v9, 1 +; CI-NEXT: v_mov_b32_e32 v10, 2 +; CI-NEXT: v_mov_b32_e32 v11, 2 +; CI-NEXT: v_mov_b32_e32 v12, 2 +; CI-NEXT: v_mov_b32_e32 v13, 2 +; CI-NEXT: v_mov_b32_e32 v14, 2 +; CI-NEXT: v_mov_b32_e32 v15, 3 +; CI-NEXT: v_mov_b32_e32 v16, 3 +; CI-NEXT: v_mov_b32_e32 v17, 3 +; CI-NEXT: v_mov_b32_e32 v18, 3 +; CI-NEXT: v_mov_b32_e32 v19, 3 +; CI-NEXT: v_mov_b32_e32 v20, 4 +; CI-NEXT: v_mov_b32_e32 v21, 4 +; CI-NEXT: v_mov_b32_e32 v22, 4 +; CI-NEXT: v_mov_b32_e32 v23, 4 +; CI-NEXT: v_mov_b32_e32 v24, 4 +; CI-NEXT: v_mov_b32_e32 v25, 5 +; CI-NEXT: v_mov_b32_e32 v26, 5 +; CI-NEXT: v_mov_b32_e32 v27, 5 +; CI-NEXT: v_mov_b32_e32 v28, 5 +; CI-NEXT: v_mov_b32_e32 v29, 5 +; CI-NEXT: v_mov_b32_e32 v30, 6 +; CI-NEXT: v_writelane_b32 v41, s4, 0 +; CI-NEXT: v_writelane_b32 v40, s31, 1 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: v_readlane_b32 s31, v40, 1 +; CI-NEXT: v_readlane_b32 s30, v40, 0 +; CI-NEXT: v_readlane_b32 s4, v41, 0 +; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CI-NEXT: s_mov_b64 exec, s[6:7] +; CI-NEXT: s_addk_i32 s32, 0xfc00 +; CI-NEXT: s_mov_b32 s33, s4 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: stack_8xv5i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 7 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, 11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, 12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX9-NEXT: v_mov_b32_e32 v0, 14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 1 +; GFX9-NEXT: v_mov_b32_e32 v6, 1 +; GFX9-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-NEXT: v_mov_b32_e32 v8, 1 +; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-NEXT: v_mov_b32_e32 v10, 2 +; GFX9-NEXT: v_mov_b32_e32 v11, 2 +; GFX9-NEXT: v_mov_b32_e32 v12, 2 +; GFX9-NEXT: v_mov_b32_e32 v13, 2 +; GFX9-NEXT: v_mov_b32_e32 v14, 2 +; GFX9-NEXT: v_mov_b32_e32 v15, 3 +; GFX9-NEXT: v_mov_b32_e32 v16, 3 +; GFX9-NEXT: v_mov_b32_e32 v17, 3 +; GFX9-NEXT: v_mov_b32_e32 v18, 3 +; GFX9-NEXT: v_mov_b32_e32 v19, 3 +; GFX9-NEXT: v_mov_b32_e32 v20, 4 +; GFX9-NEXT: v_mov_b32_e32 v21, 4 +; GFX9-NEXT: v_mov_b32_e32 v22, 4 +; GFX9-NEXT: v_mov_b32_e32 v23, 4 +; GFX9-NEXT: v_mov_b32_e32 v24, 4 +; GFX9-NEXT: v_mov_b32_e32 v25, 5 +; GFX9-NEXT: v_mov_b32_e32 v26, 5 +; GFX9-NEXT: v_mov_b32_e32 v27, 5 +; GFX9-NEXT: v_mov_b32_e32 v28, 5 +; GFX9-NEXT: v_mov_b32_e32 v29, 5 +; GFX9-NEXT: v_mov_b32_e32 v30, 6 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_8xv5i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, 8 +; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_mov_b32 v3, 10 +; GFX11-NEXT: v_dual_mov_b32 v8, 15 :: v_dual_mov_b32 v5, 12 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_dual_mov_b32 v4, 11 :: v_dual_mov_b32 v7, 14 +; GFX11-NEXT: v_mov_b32_e32 v6, 13 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: s_add_i32 s0, s32, 32 +; GFX11-NEXT: s_add_i32 s1, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: scratch_store_b32 off, v8, s0 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], s1 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, 1 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 1 +; GFX11-NEXT: v_dual_mov_b32 v6, 1 :: v_dual_mov_b32 v9, 1 +; GFX11-NEXT: v_dual_mov_b32 v8, 1 :: v_dual_mov_b32 v11, 2 +; GFX11-NEXT: v_dual_mov_b32 v10, 2 :: v_dual_mov_b32 v13, 2 +; GFX11-NEXT: v_dual_mov_b32 v12, 2 :: v_dual_mov_b32 v15, 3 +; GFX11-NEXT: v_dual_mov_b32 v14, 2 :: v_dual_mov_b32 v17, 3 +; GFX11-NEXT: v_dual_mov_b32 v16, 3 :: v_dual_mov_b32 v19, 3 +; GFX11-NEXT: v_dual_mov_b32 v18, 3 :: v_dual_mov_b32 v21, 4 +; GFX11-NEXT: v_dual_mov_b32 v20, 4 :: v_dual_mov_b32 v23, 4 +; GFX11-NEXT: v_dual_mov_b32 v22, 4 :: v_dual_mov_b32 v25, 5 +; GFX11-NEXT: v_dual_mov_b32 v24, 4 :: v_dual_mov_b32 v27, 5 +; GFX11-NEXT: v_dual_mov_b32 v26, 5 :: v_dual_mov_b32 v29, 5 +; GFX11-NEXT: v_mov_b32_e32 v28, 5 +; GFX11-NEXT: v_mov_b32_e32 v30, 6 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; HSA-LABEL: stack_8xv5i32: +; HSA: ; %bb.0: ; %entry +; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HSA-NEXT: s_mov_b32 s4, s33 +; HSA-NEXT: s_mov_b32 s33, s32 +; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 +; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; HSA-NEXT: s_mov_b64 exec, s[8:9] +; HSA-NEXT: s_addk_i32 s32, 0x400 +; HSA-NEXT: v_mov_b32_e32 v0, 7 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; HSA-NEXT: v_mov_b32_e32 v0, 8 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: v_mov_b32_e32 v0, 9 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; HSA-NEXT: v_mov_b32_e32 v0, 10 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; HSA-NEXT: v_mov_b32_e32 v0, 11 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; HSA-NEXT: v_mov_b32_e32 v0, 12 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; HSA-NEXT: v_mov_b32_e32 v0, 13 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; HSA-NEXT: v_mov_b32_e32 v0, 14 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; HSA-NEXT: v_mov_b32_e32 v0, 15 +; HSA-NEXT: v_writelane_b32 v40, s30, 0 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; HSA-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NEXT: v_mov_b32_e32 v1, 0 +; HSA-NEXT: v_mov_b32_e32 v2, 0 +; HSA-NEXT: v_mov_b32_e32 v3, 0 +; HSA-NEXT: v_mov_b32_e32 v4, 0 +; HSA-NEXT: v_mov_b32_e32 v5, 1 +; HSA-NEXT: v_mov_b32_e32 v6, 1 +; HSA-NEXT: v_mov_b32_e32 v7, 1 +; HSA-NEXT: v_mov_b32_e32 v8, 1 +; HSA-NEXT: v_mov_b32_e32 v9, 1 +; HSA-NEXT: v_mov_b32_e32 v10, 2 +; HSA-NEXT: v_mov_b32_e32 v11, 2 +; HSA-NEXT: v_mov_b32_e32 v12, 2 +; HSA-NEXT: v_mov_b32_e32 v13, 2 +; HSA-NEXT: v_mov_b32_e32 v14, 2 +; HSA-NEXT: v_mov_b32_e32 v15, 3 +; HSA-NEXT: v_mov_b32_e32 v16, 3 +; HSA-NEXT: v_mov_b32_e32 v17, 3 +; HSA-NEXT: v_mov_b32_e32 v18, 3 +; HSA-NEXT: v_mov_b32_e32 v19, 3 +; HSA-NEXT: v_mov_b32_e32 v20, 4 +; HSA-NEXT: v_mov_b32_e32 v21, 4 +; HSA-NEXT: v_mov_b32_e32 v22, 4 +; HSA-NEXT: v_mov_b32_e32 v23, 4 +; HSA-NEXT: v_mov_b32_e32 v24, 4 +; HSA-NEXT: v_mov_b32_e32 v25, 5 +; HSA-NEXT: v_mov_b32_e32 v26, 5 +; HSA-NEXT: v_mov_b32_e32 v27, 5 +; HSA-NEXT: v_mov_b32_e32 v28, 5 +; HSA-NEXT: v_mov_b32_e32 v29, 5 +; HSA-NEXT: v_mov_b32_e32 v30, 6 +; HSA-NEXT: v_writelane_b32 v41, s4, 0 +; HSA-NEXT: v_writelane_b32 v40, s31, 1 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: v_readlane_b32 s31, v40, 1 +; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: v_readlane_b32 s4, v41, 0 +; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; HSA-NEXT: s_mov_b64 exec, s[6:7] +; HSA-NEXT: s_addk_i32 s32, 0xfc00 +; HSA-NEXT: s_mov_b32 s33, s4 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_setpc_b64 s[30:31] entry: call void @external_void_func_8xv5i32( <5 x i32>, @@ -868,27 +6911,381 @@ ret void } -; GCN-LABEL: {{^}}stack_8xv5f32: -; GCN: v_mov_b32_e32 [[REG7:v[0-9]+]], 0x40e00000 -; GCN: buffer_store_dword [[REG7]], {{.*$}} -; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 -; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4 -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32 -; GCN: s_getpc define void @stack_8xv5f32() #0 { +; VI-LABEL: stack_8xv5f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s33 +; VI-NEXT: s_mov_b32 s33, s32 +; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[8:9] +; VI-NEXT: s_addk_i32 s32, 0x400 +; VI-NEXT: v_mov_b32_e32 v0, 0x40e00000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v0, 0x41000000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v0, 0x41100000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; VI-NEXT: v_mov_b32_e32 v0, 0x41200000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; VI-NEXT: v_mov_b32_e32 v0, 0x41300000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; VI-NEXT: v_mov_b32_e32 v0, 0x41400000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; VI-NEXT: v_mov_b32_e32 v0, 0x41500000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; VI-NEXT: v_mov_b32_e32 v0, 0x41600000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; VI-NEXT: v_mov_b32_e32 v0, 0x41700000 +; VI-NEXT: v_writelane_b32 v40, s30, 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v4, 0 +; VI-NEXT: v_mov_b32_e32 v5, 1.0 +; VI-NEXT: v_mov_b32_e32 v6, 1.0 +; VI-NEXT: v_mov_b32_e32 v7, 1.0 +; VI-NEXT: v_mov_b32_e32 v8, 1.0 +; VI-NEXT: v_mov_b32_e32 v9, 1.0 +; VI-NEXT: v_mov_b32_e32 v10, 2.0 +; VI-NEXT: v_mov_b32_e32 v11, 2.0 +; VI-NEXT: v_mov_b32_e32 v12, 2.0 +; VI-NEXT: v_mov_b32_e32 v13, 2.0 +; VI-NEXT: v_mov_b32_e32 v14, 2.0 +; VI-NEXT: v_mov_b32_e32 v15, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v16, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v17, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v18, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v19, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v20, 4.0 +; VI-NEXT: v_mov_b32_e32 v21, 4.0 +; VI-NEXT: v_mov_b32_e32 v22, 4.0 +; VI-NEXT: v_mov_b32_e32 v23, 4.0 +; VI-NEXT: v_mov_b32_e32 v24, 4.0 +; VI-NEXT: v_mov_b32_e32 v25, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v26, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v27, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v28, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v29, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; VI-NEXT: v_writelane_b32 v41, s4, 0 +; VI-NEXT: v_writelane_b32 v40, s31, 1 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: v_readlane_b32 s31, v40, 1 +; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: v_readlane_b32 s4, v41, 0 +; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[6:7] +; VI-NEXT: s_addk_i32 s32, 0xfc00 +; VI-NEXT: s_mov_b32 s33, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: stack_8xv5f32: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s4, s33 +; CI-NEXT: s_mov_b32 s33, s32 +; CI-NEXT: s_or_saveexec_b64 s[8:9], -1 +; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CI-NEXT: s_mov_b64 exec, s[8:9] +; CI-NEXT: s_addk_i32 s32, 0x400 +; CI-NEXT: v_mov_b32_e32 v0, 0x40e00000 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; CI-NEXT: v_mov_b32_e32 v0, 0x41000000 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; CI-NEXT: v_mov_b32_e32 v0, 0x41100000 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; CI-NEXT: v_mov_b32_e32 v0, 0x41200000 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; CI-NEXT: v_mov_b32_e32 v0, 0x41300000 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; CI-NEXT: v_mov_b32_e32 v0, 0x41400000 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; CI-NEXT: v_mov_b32_e32 v0, 0x41500000 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; CI-NEXT: v_mov_b32_e32 v0, 0x41600000 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; CI-NEXT: v_mov_b32_e32 v0, 0x41700000 +; CI-NEXT: v_writelane_b32 v40, s30, 0 +; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v4, 0 +; CI-NEXT: v_mov_b32_e32 v5, 1.0 +; CI-NEXT: v_mov_b32_e32 v6, 1.0 +; CI-NEXT: v_mov_b32_e32 v7, 1.0 +; CI-NEXT: v_mov_b32_e32 v8, 1.0 +; CI-NEXT: v_mov_b32_e32 v9, 1.0 +; CI-NEXT: v_mov_b32_e32 v10, 2.0 +; CI-NEXT: v_mov_b32_e32 v11, 2.0 +; CI-NEXT: v_mov_b32_e32 v12, 2.0 +; CI-NEXT: v_mov_b32_e32 v13, 2.0 +; CI-NEXT: v_mov_b32_e32 v14, 2.0 +; CI-NEXT: v_mov_b32_e32 v15, 0x40400000 +; CI-NEXT: v_mov_b32_e32 v16, 0x40400000 +; CI-NEXT: v_mov_b32_e32 v17, 0x40400000 +; CI-NEXT: v_mov_b32_e32 v18, 0x40400000 +; CI-NEXT: v_mov_b32_e32 v19, 0x40400000 +; CI-NEXT: v_mov_b32_e32 v20, 4.0 +; CI-NEXT: v_mov_b32_e32 v21, 4.0 +; CI-NEXT: v_mov_b32_e32 v22, 4.0 +; CI-NEXT: v_mov_b32_e32 v23, 4.0 +; CI-NEXT: v_mov_b32_e32 v24, 4.0 +; CI-NEXT: v_mov_b32_e32 v25, 0x40a00000 +; CI-NEXT: v_mov_b32_e32 v26, 0x40a00000 +; CI-NEXT: v_mov_b32_e32 v27, 0x40a00000 +; CI-NEXT: v_mov_b32_e32 v28, 0x40a00000 +; CI-NEXT: v_mov_b32_e32 v29, 0x40a00000 +; CI-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; CI-NEXT: v_writelane_b32 v41, s4, 0 +; CI-NEXT: v_writelane_b32 v40, s31, 1 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 +; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CI-NEXT: v_readlane_b32 s31, v40, 1 +; CI-NEXT: v_readlane_b32 s30, v40, 0 +; CI-NEXT: v_readlane_b32 s4, v41, 0 +; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CI-NEXT: s_mov_b64 exec, s[6:7] +; CI-NEXT: s_addk_i32 s32, 0xfc00 +; CI-NEXT: s_mov_b32 s33, s4 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: stack_8xv5f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40e00000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41000000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41100000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41200000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41300000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41400000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v6, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v7, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v8, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v9, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v10, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v11, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v12, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v13, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v14, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v16, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v17, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v19, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v20, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v21, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v22, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v23, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v24, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v25, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v26, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v27, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v28, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_8xv5f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x40e00000 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x41000000 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x41100000 +; GFX11-NEXT: v_mov_b32_e32 v3, 0x41200000 +; GFX11-NEXT: v_mov_b32_e32 v8, 0x41700000 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_mov_b32_e32 v4, 0x41300000 +; GFX11-NEXT: v_mov_b32_e32 v5, 0x41400000 +; GFX11-NEXT: v_dual_mov_b32 v6, 0x41500000 :: v_dual_mov_b32 v9, 1.0 +; GFX11-NEXT: v_mov_b32_e32 v7, 0x41600000 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: s_add_i32 s0, s32, 32 +; GFX11-NEXT: s_add_i32 s1, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: scratch_store_b32 off, v8, s0 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], s1 +; GFX11-NEXT: v_mov_b32_e32 v6, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_mov_b32 v8, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v11, 2.0 :: v_dual_mov_b32 v10, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v13, 2.0 :: v_dual_mov_b32 v12, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v15, 0x40400000 :: v_dual_mov_b32 v14, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v17, 0x40400000 :: v_dual_mov_b32 v16, 0x40400000 +; GFX11-NEXT: v_dual_mov_b32 v19, 0x40400000 :: v_dual_mov_b32 v18, 0x40400000 +; GFX11-NEXT: v_dual_mov_b32 v21, 4.0 :: v_dual_mov_b32 v20, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v23, 4.0 :: v_dual_mov_b32 v22, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v25, 0x40a00000 :: v_dual_mov_b32 v24, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v27, 0x40a00000 :: v_dual_mov_b32 v26, 0x40a00000 +; GFX11-NEXT: v_dual_mov_b32 v29, 0x40a00000 :: v_dual_mov_b32 v28, 0x40a00000 +; GFX11-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5f32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; HSA-LABEL: stack_8xv5f32: +; HSA: ; %bb.0: ; %entry +; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HSA-NEXT: s_mov_b32 s4, s33 +; HSA-NEXT: s_mov_b32 s33, s32 +; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 +; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; HSA-NEXT: s_mov_b64 exec, s[8:9] +; HSA-NEXT: s_addk_i32 s32, 0x400 +; HSA-NEXT: v_mov_b32_e32 v0, 0x40e00000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41000000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41100000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41200000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41300000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41400000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41500000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41600000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41700000 +; HSA-NEXT: v_writelane_b32 v40, s30, 0 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; HSA-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NEXT: v_mov_b32_e32 v1, 0 +; HSA-NEXT: v_mov_b32_e32 v2, 0 +; HSA-NEXT: v_mov_b32_e32 v3, 0 +; HSA-NEXT: v_mov_b32_e32 v4, 0 +; HSA-NEXT: v_mov_b32_e32 v5, 1.0 +; HSA-NEXT: v_mov_b32_e32 v6, 1.0 +; HSA-NEXT: v_mov_b32_e32 v7, 1.0 +; HSA-NEXT: v_mov_b32_e32 v8, 1.0 +; HSA-NEXT: v_mov_b32_e32 v9, 1.0 +; HSA-NEXT: v_mov_b32_e32 v10, 2.0 +; HSA-NEXT: v_mov_b32_e32 v11, 2.0 +; HSA-NEXT: v_mov_b32_e32 v12, 2.0 +; HSA-NEXT: v_mov_b32_e32 v13, 2.0 +; HSA-NEXT: v_mov_b32_e32 v14, 2.0 +; HSA-NEXT: v_mov_b32_e32 v15, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v16, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v17, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v18, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v19, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v20, 4.0 +; HSA-NEXT: v_mov_b32_e32 v21, 4.0 +; HSA-NEXT: v_mov_b32_e32 v22, 4.0 +; HSA-NEXT: v_mov_b32_e32 v23, 4.0 +; HSA-NEXT: v_mov_b32_e32 v24, 4.0 +; HSA-NEXT: v_mov_b32_e32 v25, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v26, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v27, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v28, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v29, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; HSA-NEXT: v_writelane_b32 v41, s4, 0 +; HSA-NEXT: v_writelane_b32 v40, s31, 1 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: v_readlane_b32 s31, v40, 1 +; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: v_readlane_b32 s4, v41, 0 +; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; HSA-NEXT: s_mov_b64 exec, s[6:7] +; HSA-NEXT: s_addk_i32 s32, 0xfc00 +; HSA-NEXT: s_mov_b32 s33, s4 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_setpc_b64 s[30:31] entry: call void @external_void_func_8xv5f32( <5 x float>, @@ -912,9 +7309,7 @@ <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>) #0 declare hidden void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>) #0 + attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind noinline } - - - diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -1,11 +1,39 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s ; Make sure we don't crash or assert on spir_kernel calling convention. -; GCN-LABEL: {{^}}kernel: -; GCN: s_endpgm define spir_kernel void @kernel(ptr addrspace(1) %out) { +; SI-LABEL: kernel: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: kernel: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: kernel: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: store i32 0, ptr addrspace(1) %out ret void @@ -20,324 +48,909 @@ ; ret void ; } -; GCN-LABEL: {{^}}ps_ret_cc_f16: -; SI: v_cvt_f16_f32_e32 v0, v0 -; SI: v_cvt_f32_f16_e32 v0, v0 -; SI: v_add_f32_e32 v0, 1.0, v0 - -; VI: v_add_f16_e32 v0, 1.0, v0 -; VI: ; return define amdgpu_ps half @ps_ret_cc_f16(half %arg0) { +; SI-LABEL: ps_ret_cc_f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ps_ret_cc_f16: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ps_ret_cc_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } -; GCN-LABEL: {{^}}ps_ret_cc_inreg_f16: -; SI: v_cvt_f16_f32_e32 v0, s0 -; SI: v_cvt_f32_f16_e32 v0, v0 -; SI: v_add_f32_e32 v0, 1.0, v0 - -; VI: v_add_f16_e64 v0, s0, 1.0 -; VI: ; return define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) { +; SI-LABEL: ps_ret_cc_inreg_f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ps_ret_cc_inreg_f16: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e64 v0, s0, 1.0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ps_ret_cc_inreg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e64 v0, s0, 1.0 +; GFX11-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } -; GCN-LABEL: {{^}}fastcc: -; GCN: v_add_f32_e32 v0, 4.0, v0 define fastcc float @fastcc(float %arg0) #0 { +; SIVI-LABEL: fastcc: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_add_f32_e32 v0, 4.0, v0 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fastcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %add = fadd float %arg0, 4.0 ret float %add } -; GCN-LABEL: {{^}}coldcc: -; GCN: v_add_f32_e32 v0, 4.0, v0 define coldcc float @coldcc(float %arg0) #0 { +; SIVI-LABEL: coldcc: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_add_f32_e32 v0, 4.0, v0 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: coldcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %add = fadd float %arg0, 4.0 ret float %add } -; GCN-LABEL: {{^}}call_coldcc: -; GCN: v_mov_b32_e32 v0, 1.0 -; GCN: s_swappc_b64 define amdgpu_kernel void @call_coldcc() #0 { +; SI-LABEL: call_coldcc: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s32, 0 +; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0xe8f000 +; SI-NEXT: s_add_u32 s8, s8, s1 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_getpc_b64 s[0:1] +; SI-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SI-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-NEXT: s_mov_b64 s[0:1], s[8:9] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: call_coldcc: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s90, -1 +; VI-NEXT: s_mov_b32 s91, 0xe80000 +; VI-NEXT: s_add_u32 s88, s88, s1 +; VI-NEXT: s_addc_u32 s89, s89, 0 +; VI-NEXT: s_getpc_b64 s[0:1] +; VI-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[0:1], s[88:89] +; VI-NEXT: s_mov_b64 s[2:3], s[90:91] +; VI-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: call_coldcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = call float @coldcc(float 1.0) store float %val, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}call_fastcc: -; GCN: v_mov_b32_e32 v0, 1.0 -; GCN: s_swappc_b64 define amdgpu_kernel void @call_fastcc() #0 { +; SI-LABEL: call_fastcc: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s32, 0 +; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0xe8f000 +; SI-NEXT: s_add_u32 s8, s8, s1 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_getpc_b64 s[0:1] +; SI-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SI-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-NEXT: s_mov_b64 s[0:1], s[8:9] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: call_fastcc: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s90, -1 +; VI-NEXT: s_mov_b32 s91, 0xe80000 +; VI-NEXT: s_add_u32 s88, s88, s1 +; VI-NEXT: s_addc_u32 s89, s89, 0 +; VI-NEXT: s_getpc_b64 s[0:1] +; VI-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[0:1], s[88:89] +; VI-NEXT: s_mov_b64 s[2:3], s[90:91] +; VI-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: call_fastcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = call float @fastcc(float 1.0) store float %val, ptr addrspace(1) undef ret void } ; Mesa compute shader: check for 47176 (COMPUTE_PGM_RSRC1) in .AMDGPU.config -; GCN-LABEL: .AMDGPU.config -; GCN: .long 47176 -; GCN-LABEL: {{^}}cs_mesa: define amdgpu_cs half @cs_mesa(half %arg0) { +; SI-LABEL: cs_mesa: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: cs_mesa: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: cs_mesa: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } ; Mesa pixel shader: check for 45096 (SPI_SHADER_PGM_RSRC1_PS) in .AMDGPU.config -; GCN-LABEL: .AMDGPU.config -; GCN: .long 45096 -; GCN-LABEL: {{^}}ps_mesa_f16: define amdgpu_ps half @ps_mesa_f16(half %arg0) { +; SI-LABEL: ps_mesa_f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ps_mesa_f16: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ps_mesa_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } ; Mesa vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in .AMDGPU.config -; GCN-LABEL: .AMDGPU.config -; GCN: .long 45352 -; GCN-LABEL: {{^}}vs_mesa: define amdgpu_vs half @vs_mesa(half %arg0) { +; SI-LABEL: vs_mesa: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: vs_mesa: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: vs_mesa: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } ; Mesa geometry shader: check for 45608 (SPI_SHADER_PGM_RSRC1_GS) in .AMDGPU.config -; GCN-LABEL: .AMDGPU.config -; GCN: .long 45608 -; GCN-LABEL: {{^}}gs_mesa: define amdgpu_gs half @gs_mesa(half %arg0) { +; SI-LABEL: gs_mesa: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: gs_mesa: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: gs_mesa: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } ; Mesa hull shader: check for 46120 (SPI_SHADER_PGM_RSRC1_HS) in .AMDGPU.config -; GCN-LABEL: .AMDGPU.config -; GCN: .long 46120 -; GCN-LABEL: {{^}}hs_mesa: define amdgpu_hs half @hs_mesa(half %arg0) { +; SI-LABEL: hs_mesa: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: hs_mesa: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: hs_mesa: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } ; FIXME: Inconsistent ABI between targets -; GCN-LABEL: {{^}}ps_mesa_v2f16: -; VI: v_mov_b32_e32 v1, 0x3c00 -; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: ; return - -; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT0:v[0-9]+]], v0 -; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT1:v[0-9]+]], v1 -; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT0:v[0-9]+]], [[CVT_ELT0]] -; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT1:v[0-9]+]], [[CVT_ELT1]] -; SI-DAG: v_add_f32_e32 v0, 1.0, [[RECVT_ELT0]] -; SI-DAG: v_add_f32_e32 v1, 1.0, [[RECVT_ELT1]] -; SI: ; return to shader part epilog + define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) { +; SI-LABEL: ps_mesa_v2f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ps_mesa_v2f16: +; VI: ; %bb.0: +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ps_mesa_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: ; return to shader part epilog %add = fadd <2 x half> %arg0, ret <2 x half> %add } -; GCN-LABEL: {{^}}ps_mesa_inreg_v2f16: -; VI: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 -; VI-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e64 v1, s0, 1.0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: ; return to shader part epilog - -; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT0:v[0-9]+]], s0 -; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT1:v[0-9]+]], s1 -; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT0:v[0-9]+]], [[CVT_ELT0]] -; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT1:v[0-9]+]], [[CVT_ELT1]] -; SI-DAG: v_add_f32_e32 v0, 1.0, [[RECVT_ELT0]] -; SI-DAG: v_add_f32_e32 v1, 1.0, [[RECVT_ELT1]] -; SI: ; return to shader part epilog define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_v2f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v2 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ps_mesa_inreg_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; VI-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e64 v1, s0, 1.0 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ps_mesa_inreg_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: ; return to shader part epilog %add = fadd <2 x half> %arg0, ret <2 x half> %add } -; GCN-LABEL: {{^}}ps_mesa_v2i16: -; VI: v_mov_b32_e32 v2, 1 -; VI: v_add_u16_e32 v1, 1, v0 -; VI: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI: v_or_b32_e32 v0, v1, v0 - - -; SI: v_lshlrev_b32_e32 v1, 16, v1 -; SI: v_add_i32_e32 v0, vcc, 1, v0 -; SI: v_and_b32 -; SI: v_or_b32 -; SI: v_add_i32_e32 v0, vcc, 0x10000, v0 define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) { +; SI-LABEL: ps_mesa_v2i16: +; SI: ; %bb.0: +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x10000, v0 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_v2i16: +; VI: ; %bb.0: +; VI-NEXT: v_mov_b32_e32 v2, 1 +; VI-NEXT: v_add_u16_e32 v1, 1, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add <2 x i16> %arg0, store <2 x i16> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_inreg_v2i16: -; VI: s_and_b32 s1, s0, 0xffff0000 -; VI: s_add_i32 s0, s0, 1 -; VI: s_and_b32 s0, s0, 0xffff -; VI: s_or_b32 s0, s1, s0 -; VI: s_add_i32 s0, s0, 0x10000 -; VI: v_mov_b32_e32 v0, s0 - -; SI: s_lshl_b32 s1, s1, 16 -; SI: s_add_i32 s0, s0, 1 -; SI: s_and_b32 s0, s0, 0xffff -; SI: s_or_b32 s0, s1, s0 -; SI: s_add_i32 s0, s0, 0x10000 define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_lshl_b32 s1, s1, 16 +; SI-NEXT: s_add_i32 s0, s0, 1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_and_b32 s0, s0, 0xffff +; SI-NEXT: s_or_b32 s0, s1, s0 +; SI-NEXT: s_add_i32 s0, s0, 0x10000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_inreg_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_and_b32 s1, s0, 0xffff0000 +; VI-NEXT: s_add_i32 s0, s0, 1 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_add_i32 s0, s0, 0x10000 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_inreg_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_sub_u16 v0, s0, -1 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add <2 x i16> %arg0, store <2 x i16> %add, ptr addrspace(1) undef ret void } ; FIXME: Differenet ABI for VI+ -; GCN-LABEL: {{^}}ps_mesa_v4f16: -; SI: v_cvt_f16_f32_e32 v3, v3 -; SI: v_cvt_f16_f32_e32 v2, v2 -; SI: v_cvt_f16_f32_e32 v1, v1 -; SI: v_cvt_f16_f32_e32 v0, v0 - -; VI: v_add_f16_e32 v2, 1.0, v1 -; VI: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI: v_add_f16_e32 v4, 1.0, v0 -; VI: v_add_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD + define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) { +; SI-LABEL: ps_mesa_v4f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ps_mesa_v4f16: +; VI: ; %bb.0: +; VI-NEXT: v_mov_b32_e32 v3, 0x3c00 +; VI-NEXT: v_add_f16_e32 v2, 1.0, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 1.0, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ps_mesa_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: ; return to shader part epilog %add = fadd <4 x half> %arg0, ret <4 x half> %add } -; GCN-LABEL: {{^}}ps_mesa_inreg_v4f16: -; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, s3 -; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, s2 -; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, s1 -; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, s0 - -; VI: v_add_f16_e64 -; VI: v_add_f16_sdwa -; VI: v_add_f16_e64 -; VI: v_add_f16_sdwa define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_v4f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, s3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v4 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ps_mesa_inreg_v4f16: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e64 v1, s1, 1.0 +; VI-NEXT: s_lshr_b32 s1, s1, 16 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; VI-NEXT: v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e64 v0, s0, 1.0 +; VI-NEXT: s_lshr_b32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_add_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v3 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ps_mesa_inreg_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v1, s1, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: ; return to shader part epilog %add = fadd <4 x half> %arg0, ret <4 x half> %add } -; GCN-LABEL: {{^}}ps_mesa_inreg_v3i32: -; GCN-DAG: s_add_i32 s0, s0, 1 -; GCN-DAG: s_add_i32 s{{[0-9]*}}, s1, 2 -; GCN-DAG: s_add_i32 s{{[0-9]*}}, s2, 3 define amdgpu_ps void @ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_v3i32: +; SI: ; %bb.0: +; SI-NEXT: s_add_i32 s1, s1, 2 +; SI-NEXT: s_add_i32 s0, s0, 1 +; SI-NEXT: s_add_i32 s4, s2, 3 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_inreg_v3i32: +; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s2, s2, 3 +; VI-NEXT: s_add_i32 s1, s1, 2 +; VI-NEXT: s_add_i32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dwordx3 v[0:1], v[0:2] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_inreg_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: s_add_i32 s1, s1, 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add <3 x i32> %arg0, store <3 x i32> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_inreg_v3f32: -; GCN-DAG: v_add_f32{{.*}}, s0, 1.0 -; GCN-DAG: v_add_f32{{.*}}, s1, 2.0 -; GCN-DAG: v_add_f32{{.*}}, s2, 4.0 define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_v3f32: +; SI: ; %bb.0: +; SI-NEXT: v_add_f32_e64 v1, s1, 2.0 +; SI-NEXT: v_add_f32_e64 v0, s0, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s2, 4.0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_inreg_v3f32: +; VI: ; %bb.0: +; VI-NEXT: v_add_f32_e64 v2, s2, 4.0 +; VI-NEXT: v_add_f32_e64 v1, s1, 2.0 +; VI-NEXT: v_add_f32_e64 v0, s0, 1.0 +; VI-NEXT: flat_store_dwordx3 v[0:1], v[0:2] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_inreg_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v2, s2, 4.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 2.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd <3 x float> %arg0, store <3 x float> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_inreg_v5i32: -; GCN-DAG: s_add_i32 s0, s0, 1 -; GCN-DAG: s_add_i32 s{{[0-9]*}}, s1, 2 -; GCN-DAG: s_add_i32 s{{[0-9]*}}, s2, 3 -; GCN-DAG: s_add_i32 s{{[0-9]*}}, s3, 4 -; GCN-DAG: s_add_i32 s{{[0-9]*}}, s4, 5 define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_v5i32: +; SI: ; %bb.0: +; SI-NEXT: s_add_i32 s5, s3, 4 +; SI-NEXT: s_add_i32 s6, s2, 3 +; SI-NEXT: s_add_i32 s1, s1, 2 +; SI-NEXT: s_add_i32 s0, s0, 1 +; SI-NEXT: s_add_i32 s4, s4, 5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_inreg_v5i32: +; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s4, s4, 5 +; VI-NEXT: s_add_i32 s3, s3, 4 +; VI-NEXT: s_add_i32 s2, s2, 3 +; VI-NEXT: s_add_i32 s1, s1, 2 +; VI-NEXT: s_add_i32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_inreg_v5i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_add_i32 s3, s3, 4 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 2 +; GFX11-NEXT: s_add_i32 s4, s4, 5 +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add <5 x i32> %arg0, store <5 x i32> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_inreg_v5f32: -; GCN-DAG: v_add_f32{{.*}}, s0, 1.0 -; GCN-DAG: v_add_f32{{.*}}, s1, 2.0 -; GCN-DAG: v_add_f32{{.*}}, s2, 4.0 -; GCN-DAG: v_add_f32{{.*}}, s3, -1.0 -; GCN-DAG: v_add_f32{{.*}}, s4, 0.5 define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_v5f32: +; SI: ; %bb.0: +; SI-NEXT: v_add_f32_e64 v3, s3, -1.0 +; SI-NEXT: v_add_f32_e64 v2, s2, 4.0 +; SI-NEXT: v_add_f32_e64 v1, s1, 2.0 +; SI-NEXT: v_add_f32_e64 v0, s0, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s4, 0.5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_inreg_v5f32: +; VI: ; %bb.0: +; VI-NEXT: v_add_f32_e64 v3, s3, -1.0 +; VI-NEXT: v_add_f32_e64 v2, s2, 4.0 +; VI-NEXT: v_add_f32_e64 v1, s1, 2.0 +; VI-NEXT: v_add_f32_e64 v0, s0, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s4, 0.5 +; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_inreg_v5f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v3, s3, -1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 4.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 2.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 0.5 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd <5 x float> %arg0, store <5 x float> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_v3i32: -; GCN-DAG: v_add_{{.*}}, 1, v0 -; GCN-DAG: v_add_{{.*}}, 2, v1 -; GCN-DAG: v_add_{{.*}}, 3, v2 define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) { +; SI-LABEL: ps_mesa_v3i32: +; SI: ; %bb.0: +; SI-NEXT: v_add_i32_e32 v1, vcc, 2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_v3i32: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; VI-NEXT: flat_store_dwordx3 v[0:1], v[0:2] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add <3 x i32> %arg0, store <3 x i32> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_v3f32: -; GCN-DAG: v_add_{{.*}}, 1.0, v0 -; GCN-DAG: v_add_{{.*}}, 2.0, v1 -; GCN-DAG: v_add_{{.*}}, 4.0, v2 define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) { +; SI-LABEL: ps_mesa_v3f32: +; SI: ; %bb.0: +; SI-NEXT: v_add_f32_e32 v1, 2.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v2, 4.0, v2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_v3f32: +; VI: ; %bb.0: +; VI-NEXT: v_add_f32_e32 v2, 4.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 2.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: flat_store_dwordx3 v[0:1], v[0:2] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_add_f32 v2, 4.0, v2 :: v_dual_add_f32 v1, 2.0, v1 +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd <3 x float> %arg0, store <3 x float> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_v5i32: -; GCN-DAG: v_add_{{.*}}, 1, v0 -; GCN-DAG: v_add_{{.*}}, 2, v1 -; GCN-DAG: v_add_{{.*}}, 3, v2 -; GCN-DAG: v_add_{{.*}}, 4, v3 -; GCN-DAG: v_add_{{.*}}, 5, v4 define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) { +; SI-LABEL: ps_mesa_v5i32: +; SI: ; %bb.0: +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 5, v4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_v5i32: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v4 +; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_v5i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_nc_u32_e32 v3, 4, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 5, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add <5 x i32> %arg0, store <5 x i32> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_v5f32: -; GCN-DAG: v_add_f32{{.*}}, 1.0, v0 -; GCN-DAG: v_add_f32{{.*}}, 2.0, v1 -; GCN-DAG: v_add_f32{{.*}}, 4.0, v2 -; GCN-DAG: v_add_f32{{.*}}, -1.0, v3 -; GCN-DAG: v_add_f32{{.*}}, 0.5, v4 define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) { +; SI-LABEL: ps_mesa_v5f32: +; SI: ; %bb.0: +; SI-NEXT: v_add_f32_e32 v3, -1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 4.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 2.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v4, 0.5, v4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_v5f32: +; VI: ; %bb.0: +; VI-NEXT: v_add_f32_e32 v3, -1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 4.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 2.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v4, 0.5, v4 +; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_v5f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4 +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd <5 x float> %arg0, store <5 x float> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_i16: -; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, v0, v0 -; VI: v_add_u16_e32 v{{[0-9]+}}, v0, v0 define amdgpu_ps void @ps_mesa_i16(i16 %arg0) { +; SI-LABEL: ps_mesa_i16: +; SI: ; %bb.0: +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_i16: +; VI: ; %bb.0: +; VI-NEXT: v_add_u16_e32 v0, v0, v0 +; VI-NEXT: flat_store_short v[0:1], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_nc_u16 v0, v0, v0 +; GFX11-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add i16 %arg0, %arg0 store i16 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_inreg_i16: -; GCN: s_add_i32 s{{[0-9]+}}, s0, s0 define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_i16: +; SI: ; %bb.0: +; SI-NEXT: s_add_i32 s0, s0, s0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_inreg_i16: +; VI: ; %bb.0: +; VI-NEXT: s_and_b32 s0, 0xffff, s0 +; VI-NEXT: s_add_i32 s0, s0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: flat_store_short v[0:1], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_inreg_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add i16 %arg0, %arg0 store i16 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ret_ps_mesa_i16: -; GCN: s_movk_i32 s0, 0x7b define amdgpu_ps i16 @ret_ps_mesa_i16() { +; GCN-LABEL: ret_ps_mesa_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_movk_i32 s0, 0x7b +; GCN-NEXT: ; return to shader part epilog ret i16 123 } diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -1,14 +1,54 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s -; GCN-LABEL: {{^}}extract_vector_elt_v2f16: -; GCN: s_load_dword [[VEC:s[0-9]+]] -; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 -; GCN-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]] -; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] -; GCN-DAG: buffer_store_short [[VELT0]] -; GCN-DAG: buffer_store_short [[VELT1]] define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 { +; SI-LABEL: extract_vector_elt_v2f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s4, s[2:3], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s5, s4, 16 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:20 +; SI-NEXT: s_endpgm +; +; VI-LABEL: extract_vector_elt_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s4, s[2:3], 0x0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s5, s4, 16 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:20 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extract_vector_elt_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:20 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x half>, ptr addrspace(4) %vec.ptr %p0 = extractelement <2 x half> %vec, i32 0 %p1 = extractelement <2 x half> %vec, i32 1 @@ -18,34 +58,124 @@ ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v2f16_dynamic_sgpr: -; GCN: s_load_dword [[IDX:s[0-9]+]] -; GCN: s_load_dword [[VEC:s[0-9]+]] -; GCN: s_lshl_b32 [[IDX_SCALED:s[0-9]+]], [[IDX]], 4 -; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], [[IDX_SCALED]] -; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] -; GCN: buffer_store_short [[VELT1]] -; GCN: ScratchSize: 0 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 { +; SI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s0, s[0:1], 0xd +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s1, s[6:7], 0x0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_lshl_b32 s0, s0, 4 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s0, s1, s0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s6, s[6:7], 0x0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_lshl_b32 s4, s8, 4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s4, s6, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s0, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x half>, ptr addrspace(4) %vec.ptr %elt = extractelement <2 x half> %vec, i32 %idx store half %elt, ptr addrspace(1) %out, align 2 ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v2f16_dynamic_vgpr: -; GCN-DAG: s_load_dword [[VEC:s[0-9]+]] -; GCN-DAG: {{flat|buffer}}_load_dword [[IDX:v[0-9]+]] -; GCN: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 4, [[IDX]] - -; SI: v_lshr_b32_e32 [[ELT:v[0-9]+]], [[VEC]], [[IDX_SCALED]] -; VI: v_lshrrev_b32_e64 [[ELT:v[0-9]+]], [[IDX_SCALED]], [[VEC]] - - -; SI: buffer_store_short [[ELT]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]] -; GCN: ScratchSize: 0{{$}} define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 { +; SI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 +; SI-NEXT: s_load_dword s6, s[6:7], 0x0 +; SI-NEXT: s_mov_b64 s[0:1], s[4:5] +; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_lshr_b32_e32 v0, s6, v0 +; SI-NEXT: buffer_store_short v0, v[1:2], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dword v2, v[1:2] +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_load_dword s1, s[2:3], 0x0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_lshrrev_b32_e64 v2, v2, s1 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext @@ -57,12 +187,50 @@ ret void } -; GCN-LABEL: {{^}}extract_vector_elt_v3f16: -; GCN: s_load_dwordx4 - -; GCN: buffer_store_short -; GCN: buffer_store_short define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo) #0 { +; SI-LABEL: extract_vector_elt_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2 +; SI-NEXT: s_endpgm +; +; VI-LABEL: extract_vector_elt_v3f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extract_vector_elt_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %p0 = extractelement <3 x half> %foo, i32 0 %p1 = extractelement <3 x half> %foo, i32 2 %out1 = getelementptr half, ptr addrspace(1) %out, i32 1 @@ -72,35 +240,106 @@ } ; FIXME: Why sometimes vector shift? -; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16: -; SI: s_load_dword s -; SI: s_load_dwordx4 s - -; GFX89: s_load_dwordx4 s -; GFX89: s_load_dword s - - -; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4 -; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} - -; GCN: {{buffer|global}}_store_short define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo, i32 %idx) #0 { +; SI-LABEL: dynamic_extract_vector_elt_v3f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s4, s4, 4 +; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: dynamic_extract_vector_elt_v3f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_lshl_b32 s4, s8, 4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: dynamic_extract_vector_elt_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s4, s4, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %p0 = extractelement <3 x half> %foo, i32 %idx %out1 = getelementptr half, ptr addrspace(1) %out, i32 1 store half %p0, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}v_extractelement_v4f16_2: -; SI: buffer_load_dword [[LOAD:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: buffer_store_short [[LOAD]] - -; VI: flat_load_dword v -; VI: flat_store_short - -; GFX9: global_load_dword [[LOAD:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, off offset:4 -; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]] define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: v_extractelement_v4f16_2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4 +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_extractelement_v4f16_2: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dword v2, v[1:2] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_extractelement_v4f16_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext @@ -111,17 +350,69 @@ ret void } -; GCN-LABEL: {{^}}v_insertelement_v4f16_dynamic_vgpr: -; GCN-DAG: {{flat|global|buffer}}_load_dword [[IDX:v[0-9]+]], -; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] -; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] - -; GFX89: v_lshrrev_b64 v[[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]], [[SCALED_IDX]], v[[[LO]]:[[HI]]] -; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[SHIFT_LO]] - -; SI: v_lshr_b64 v[[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]], v[[[LO]]:[[HI]]], [[SCALED_IDX]] -; SI: buffer_store_short v[[SHIFT_LO]] define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: v_insertelement_v4f16_dynamic_vgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v5, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], v0 +; SI-NEXT: buffer_store_short v3, v[1:2], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_insertelement_v4f16_dynamic_vgpr: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b64 v[0:1], v0, v[1:2] +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; VI-NEXT: flat_store_short v[1:2], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v4f16_dynamic_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: buffer_load_b32 v3, off, s[4:7], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b64 v[1:2], v3, v[1:2] +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext @@ -133,13 +424,58 @@ ret void } -; GCN-LABEL: {{^}}reduce_load_vector_v8f16_extract_01: -; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]], -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0 -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) %ptr) #0 { +; SI-LABEL: reduce_load_vector_v8f16_extract_01: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s0, s[0:1], 0x0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s1, s0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; VI-LABEL: reduce_load_vector_v8f16_extract_01: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s0, s[0:1], 0x0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: reduce_load_vector_v8f16_extract_01: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s1, s0, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load <16 x half>, ptr addrspace(4) %ptr %elt0 = extractelement <16 x half> %load, i32 0 %elt1 = extractelement <16 x half> %load, i32 1 @@ -148,13 +484,58 @@ ret void } -; GCN-LABEL: {{^}}reduce_load_vector_v8f16_extract_23: -; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]], -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}} -; GCN-NOT: {{s|buffer|flat|global}}_load_ -; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) %ptr) #0 { +; SI-LABEL: reduce_load_vector_v8f16_extract_23: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dword s0, s[0:1], 0x1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s1, s0, 16 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; VI-LABEL: reduce_load_vector_v8f16_extract_23: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dword s0, s[0:1], 0x4 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: reduce_load_vector_v8f16_extract_23: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x4 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s1, s0, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load <16 x half>, ptr addrspace(4) %ptr %elt2 = extractelement <16 x half> %load, i32 2 %elt3 = extractelement <16 x half> %load, i32 3 @@ -163,9 +544,143 @@ ret void } -; GCN-LABEL: {{^}}v_extractelement_v8f16_dynamic_sgpr: -; GCN-COUNT-7: v_cndmask_b32_e32 define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { +; SI-LABEL: v_extractelement_v8f16_dynamic_sgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; SI-NEXT: v_mov_b32_e32 v5, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[0:1], s[6:7] +; SI-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64 +; SI-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: s_cmp_eq_u32 s8, 1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 5 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 6 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 7 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_extractelement_v8f16_dynamic_sgpr: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dwordx4 v[1:4], v[1:2] +; VI-NEXT: v_mov_b32_e32 v6, s5 +; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v0 +; VI-NEXT: s_cmp_eq_u32 s0, 1 +; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 3 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 5 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 6 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 7 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; VI-NEXT: flat_store_short v[5:6], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_extractelement_v8f16_dynamic_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[1:4], v1, s[6:7] +; GFX11-NEXT: s_cmp_eq_u32 s0, 1 +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NEXT: s_cmp_eq_u32 s0, 3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 4 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 5 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: s_cmp_eq_u32 s0, 7 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext @@ -176,9 +691,248 @@ ret void } -; GCN-LABEL: {{^}}v_extractelement_v16f16_dynamic_sgpr: -; GCN-COUNT-15: v_cndmask_b32_e32 define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 { +; SI-LABEL: v_extractelement_v16f16_dynamic_sgpr: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; SI-NEXT: v_mov_b32_e32 v6, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[0:1], s[6:7] +; SI-NEXT: buffer_load_dwordx4 v[1:4], v[5:6], s[0:3], 0 addr64 +; SI-NEXT: v_lshlrev_b32_e32 v9, 1, v0 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: s_mov_b64 s[6:7], s[2:3] +; SI-NEXT: buffer_load_dwordx4 v[5:8], v[5:6], s[0:3], 0 addr64 offset:16 +; SI-NEXT: s_cmp_eq_u32 s8, 1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 4 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 5 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 6 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 7 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 8 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 9 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 10 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 11 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 12 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 13 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 14 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 15 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, v[9:10], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v1, 5, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v1 +; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dwordx4 v[1:4], v[5:6] +; VI-NEXT: v_add_u32_e32 v5, vcc, 16, v5 +; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; VI-NEXT: flat_load_dwordx4 v[5:8], v[5:6] +; VI-NEXT: v_mov_b32_e32 v10, s5 +; VI-NEXT: v_add_u32_e32 v9, vcc, s4, v0 +; VI-NEXT: s_cmp_eq_u32 s0, 1 +; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 3 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 5 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 6 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 7 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 8 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 9 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 10 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 11 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 12 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 13 +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 14 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v15, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s0, 15 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; VI-NEXT: flat_store_short v[9:10], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: v_extractelement_v16f16_dynamic_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] +; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7] offset:16 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 1 +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 2 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX11-NEXT: s_cmp_eq_u32 s0, 3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 4 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 5 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: s_cmp_eq_u32 s0, 7 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX11-NEXT: s_cmp_eq_u32 s0, 9 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX11-NEXT: s_cmp_eq_u32 s0, 11 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GFX11-NEXT: s_cmp_eq_u32 s0, 13 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: s_cmp_eq_u32 s0, 15 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext