diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -1,7 +1,9 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s -; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,HSA %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,MESA,CI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HSA %s declare hidden void @external_void_func_i1(i1) 
#0 declare hidden void @external_void_func_i1_signext(i1 signext) #0 @@ -57,221 +59,1422 @@ declare hidden void @external_void_func_v16i8(<16 x i8>) #0 - ; FIXME: Should be passing -1 -; GCN-LABEL: {{^}}test_call_external_void_func_i1_imm: -; MESA: s_mov_b32 s36, SCRATCH_RSRC_DWORD - -; MESA-DAG: s_mov_b64 s[0:1], s[36:37] - -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+12 -; GCN-DAG: v_mov_b32_e32 v0, 1{{$}} -; MESA-DAG: s_mov_b64 s[2:3], s[38:39] - -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { +; VI-LABEL: test_call_external_void_func_i1_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_i1_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 1 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 
s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i1_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i1_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i1_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm 
call void @external_void_func_i1(i1 true) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext: - -; HSA: buffer_load_ubyte [[VAR:v[0-9]+]] -; HSA: s_mov_b32 s32, 0 -; MESA-DAG: buffer_load_ubyte [[VAR:v[0-9]+]] -; MESA-DAG: s_mov_b32 s32, 0{{$}} - -; GCN: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+12 -; GCN-NEXT: v_bfe_i32 v0, [[VAR]], 0, 1 -; GCN-NEXT: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { +; VI-LABEL: test_call_external_void_func_i1_signext: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; VI-NEXT: v_bfe_i32 v0, v0, 0, 1 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_i1_signext: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s5 
+; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; MESA-NEXT: v_bfe_i32 v0, v0, 0, 1 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i1_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i1_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_signext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_signext@rel32@hi+12 +; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; 
HSA-LABEL: test_call_external_void_func_i1_signext: +; HSA: ; %bb.0: +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12 +; HSA-NEXT: v_bfe_i32 v0, v0, 0, 1 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %var = load volatile i1, ptr addrspace(1) undef call void @external_void_func_i1_signext(i1 signext %var) ret void } ; FIXME: load should be scheduled before getpc -; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext: - -; HSA: buffer_load_ubyte [[VAL:v[0-9]+]] -; HSA-DAG: s_mov_b32 s32, 0{{$}} - -; MESA: buffer_load_ubyte [[VAL:v[0-9]+]] -; MESA-DAG: s_mov_b32 s32, 0{{$}} - -; GCN: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12 -; GCN-NEXT: v_and_b32_e32 v0, 1, [[VAL]] -; GCN-NEXT: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { +; VI-LABEL: test_call_external_void_func_i1_zeroext: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; 
VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; VI-NEXT: v_and_b32_e32 v0, 1, v0 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_i1_zeroext: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s5 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; MESA-NEXT: v_and_b32_e32 v0, 1, v0 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i1_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: 
s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i1_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_zeroext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_zeroext@rel32@hi+12 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i1_zeroext: +; HSA: ; %bb.0: +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1_zeroext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1_zeroext@rel32@hi+12 +; HSA-NEXT: v_and_b32_e32 v0, 1, v0 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %var = load volatile i1, ptr addrspace(1) undef call void @external_void_func_i1_zeroext(i1 zeroext %var) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm: - -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], 
s[[PC_HI]], external_void_func_i8@rel32@hi+12 -; GCN-DAG: v_mov_b32_e32 v0, 0x7b - -; GCN-DAG: s_mov_b32 s32, 0{{$}} - -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { +; VI-LABEL: test_call_external_void_func_i8_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x7b +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_i8_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s5 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 0x7b +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i8_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: 
s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i8_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i8_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x7b +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i8@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_i8(i8 123) ret void } ; FIXME: don't wait before call -; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext: - -; GCN-DAG: buffer_load_sbyte [[VAL:v[0-9]+]] -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+12 - -; GCN-DAG: s_mov_b32 s32, 0 - -; GCN-NOT: s_waitcnt -; 
GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { +; VI-LABEL: test_call_external_void_func_i8_signext: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_i8_signext: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s5 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i8_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; 
GFX9-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i8_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_i8 v0, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_signext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_signext@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i8_signext: +; HSA: ; %bb.0: +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 glc +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8_signext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, 
external_void_func_i8_signext@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %var = load volatile i8, ptr addrspace(1) undef call void @external_void_func_i8_signext(i8 signext %var) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext: - -; GCN-DAG: buffer_load_ubyte [[VAL:v[0-9]+]] -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+12 - -; GCN-DAG: s_mov_b32 s32, 0 - -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { +; VI-LABEL: test_call_external_void_func_i8_zeroext: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_i8_zeroext: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 
0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s5 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i8_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i8_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_zeroext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_zeroext@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: 
test_call_external_void_func_i8_zeroext: +; HSA: ; %bb.0: +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 glc +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8_zeroext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i8_zeroext@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %var = load volatile i8, ptr addrspace(1) undef call void @external_void_func_i8_zeroext(i8 zeroext %var) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i16_imm: -; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}} - -; GCN-DAG: s_mov_b32 s32, 0 - -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { +; VI-LABEL: test_call_external_void_func_i16_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x7b +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_i16_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 
+; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 0x7b +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i16_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 
s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x7b +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_i16(i16 123) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext: - -; GCN-DAG: buffer_load_sshort [[VAL:v[0-9]+]] -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+12 - -; GCN-DAG: s_mov_b32 s32, 0 - -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { +; VI-LABEL: test_call_external_void_func_i16_signext: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_i16_signext: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc +; MESA-NEXT: s_waitcnt vmcnt(0) +; 
MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s5 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i16_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i16_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_i16 v0, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_signext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, 
external_void_func_i16_signext@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i16_signext: +; HSA: ; %bb.0: +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_sshort v0, off, s[4:7], 0 glc +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16_signext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16_signext@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %var = load volatile i16, ptr addrspace(1) undef call void @external_void_func_i16_signext(i16 signext %var) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i16_zeroext: - -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+12 - -; GCN-DAG: s_mov_b32 s32, 0 - -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { +; VI-LABEL: test_call_external_void_func_i16_zeroext: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 
s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_i16_zeroext: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s5 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i16_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 
s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i16_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_zeroext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_zeroext@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i16_zeroext: +; HSA: ; %bb.0: +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16_zeroext@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16_zeroext@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %var = load volatile i16, ptr addrspace(1) undef call void @external_void_func_i16_zeroext(i16 zeroext %var) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm: - -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+12 -; GCN-DAG: v_mov_b32_e32 v0, 42 -; GCN-DAG: s_mov_b32 s32, 0 - -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; 
GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { +; VI-LABEL: test_call_external_void_func_i32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_i32_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s5 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 42 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 42 
+; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 42 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_i32(i32 42) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_i64_imm: -; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}} -; GCN-DAG: v_mov_b32_e32 v1, 0{{$}} -; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]] -; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i64@rel32@lo+4 -; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i64@rel32@hi+12 -; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]] -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { +; VI-LABEL: test_call_external_void_func_i64_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, 
SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x7b +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_i64_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 0x7b +; MESA-NEXT: v_mov_b32_e32 v1, 0 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_i64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: 
s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_i64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_i64_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x7b +; HSA-NEXT: v_mov_b32_e32 v1, 0 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_i64(i64 123) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2i64: -; GCN: buffer_load_dwordx4 v[0:3] -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { +; VI-LABEL: test_call_external_void_func_v2i64: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s1, s0 +; VI-NEXT: 
buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v2i64: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b32 s0, 0 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: s_mov_b32 s1, s0 +; MESA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: 
s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s5, s4 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2i64: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s8, 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 s9, s8 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <2 x i64>, ptr addrspace(1) null call void @external_void_func_v2i64(<2 x i64> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2i64_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1 -; GCN-DAG: v_mov_b32_e32 v1, 2 -; GCN-DAG: v_mov_b32_e32 v2, 3 -; GCN-DAG: v_mov_b32_e32 v3, 4 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { +; VI-LABEL: 
test_call_external_void_func_v2i64_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, 2 +; VI-NEXT: v_mov_b32_e32 v2, 3 +; VI-NEXT: v_mov_b32_e32 v3, 4 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v2i64_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 1 +; MESA-NEXT: v_mov_b32_e32 v1, 2 +; MESA-NEXT: v_mov_b32_e32 v2, 3 +; MESA-NEXT: v_mov_b32_e32 v3, 4 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2i64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; 
GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_mov_b32_e32 v2, 3 +; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2i64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2i64_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1 +; HSA-NEXT: v_mov_b32_e32 v1, 2 +; HSA-NEXT: v_mov_b32_e32 v2, 3 +; HSA-NEXT: v_mov_b32_e32 v3, 4 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v2i64(<2 x i64> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3i64: -; GCN: buffer_load_dwordx4 v[0:3] -; GCN: v_mov_b32_e32 v4, 1 -; GCN: v_mov_b32_e32 v5, 2 -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 define 
amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { +; VI-LABEL: test_call_external_void_func_v3i64: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s1, s0 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v4, 1 +; VI-NEXT: v_mov_b32_e32 v5, 2 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v3i64: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b32 s0, 0 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: s_mov_b32 s1, s0 +; MESA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v4, 1 +; MESA-NEXT: v_mov_b32_e32 v5, 2 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3i64: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, 2 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s5, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3i64: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s8, 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 s9, s8 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: 
s_addc_u32 s1, s1, 0 +; HSA-NEXT: v_mov_b32_e32 v4, 1 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v5, 2 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %load = load <2 x i64>, ptr addrspace(1) null %val = shufflevector <2 x i64> %load, <2 x i64> , <3 x i32> @@ -279,343 +1482,3110 @@ ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v4i64: -; GCN: buffer_load_dwordx4 v[0:3] -; GCN-DAG: v_mov_b32_e32 v4, 1 -; GCN-DAG: v_mov_b32_e32 v5, 2 -; GCN-DAG: v_mov_b32_e32 v6, 3 -; GCN-DAG: v_mov_b32_e32 v7, 4 - -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { +; VI-LABEL: test_call_external_void_func_v4i64: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s1, s0 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v4, 1 +; VI-NEXT: v_mov_b32_e32 v5, 2 +; VI-NEXT: v_mov_b32_e32 v6, 3 +; VI-NEXT: v_mov_b32_e32 v7, 4 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v4i64: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, 
SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b32 s0, 0 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: s_mov_b32 s1, s0 +; MESA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v4, 1 +; MESA-NEXT: v_mov_b32_e32 v5, 2 +; MESA-NEXT: v_mov_b32_e32 v6, 3 +; MESA-NEXT: v_mov_b32_e32 v7, 4 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v4i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s1, s0 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, 2 +; GFX9-NEXT: v_mov_b32_e32 v6, 3 +; GFX9-NEXT: v_mov_b32_e32 v7, 4 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v4i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 
s4, 0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s5, s4 +; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v4i64: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s8, 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 s9, s8 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: v_mov_b32_e32 v4, 1 +; HSA-NEXT: v_mov_b32_e32 v5, 2 +; HSA-NEXT: v_mov_b32_e32 v6, 3 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v7, 4 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %load = load <2 x i64>, ptr addrspace(1) null %val = shufflevector <2 x i64> %load, <2 x i64> , <4 x i32> call void @external_void_func_v4i64(<4 x i64> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_f16_imm: -; VI: v_mov_b32_e32 v0, 0x4400 -; CI: v_mov_b32_e32 v0, 4.0 -; GCN-NOT: v0 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { +; VI-LABEL: test_call_external_void_func_f16_imm: +; VI: ; %bb.0: +; 
VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x4400 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_f16_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 4.0 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_f16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 +; GFX9-NEXT: 
s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_f16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_f16_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x4400 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_f16(half 4.0) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_f32_imm: -; GCN: v_mov_b32_e32 v0, 4.0 -; GCN-NOT: v0 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { +; VI-LABEL: test_call_external_void_func_f32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 4.0 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, 
external_void_func_f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_f32_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 4.0 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_f32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_f32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f32@rel32@hi+12 +; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_f32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 4.0 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_f32(float 4.0) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2f32_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1.0 -; GCN-DAG: v_mov_b32_e32 v1, 2.0 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { +; VI-LABEL: test_call_external_void_func_v2f32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v2f32_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, 
s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 1.0 +; MESA-NEXT: v_mov_b32_e32 v1, 2.0 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2f32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2f32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2f32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: 
s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1.0 +; HSA-NEXT: v_mov_b32_e32 v1, 2.0 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v2f32(<2 x float> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3f32_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1.0 -; GCN-DAG: v_mov_b32_e32 v1, 2.0 -; GCN-DAG: v_mov_b32_e32 v2, 4.0 -; GCN-NOT: v3, -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { +; VI-LABEL: test_call_external_void_func_v3f32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-NEXT: v_mov_b32_e32 v2, 4.0 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v3f32_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: 
v_mov_b32_e32 v0, 1.0 +; MESA-NEXT: v_mov_b32_e32 v1, 2.0 +; MESA-NEXT: v_mov_b32_e32 v2, 4.0 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3f32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3f32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3f32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; 
HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1.0 +; HSA-NEXT: v_mov_b32_e32 v1, 2.0 +; HSA-NEXT: v_mov_b32_e32 v2, 4.0 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v3f32(<3 x float> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v5f32_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1.0 -; GCN-DAG: v_mov_b32_e32 v1, 2.0 -; GCN-DAG: v_mov_b32_e32 v2, 4.0 -; GCN-DAG: v_mov_b32_e32 v3, -1.0 -; GCN-DAG: v_mov_b32_e32 v4, 0.5 -; GCN-NOT: v5, -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { +; VI-LABEL: test_call_external_void_func_v5f32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-NEXT: v_mov_b32_e32 v2, 4.0 +; VI-NEXT: v_mov_b32_e32 v3, -1.0 +; VI-NEXT: v_mov_b32_e32 v4, 0.5 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v5f32_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], 
s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 1.0 +; MESA-NEXT: v_mov_b32_e32 v1, 2.0 +; MESA-NEXT: v_mov_b32_e32 v2, 4.0 +; MESA-NEXT: v_mov_b32_e32 v3, -1.0 +; MESA-NEXT: v_mov_b32_e32 v4, 0.5 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v5f32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v3, -1.0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v5f32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v5f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v5f32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 
s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v5f32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1.0 +; HSA-NEXT: v_mov_b32_e32 v1, 2.0 +; HSA-NEXT: v_mov_b32_e32 v2, 4.0 +; HSA-NEXT: v_mov_b32_e32 v3, -1.0 +; HSA-NEXT: v_mov_b32_e32 v4, 0.5 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v5f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v5f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v5f32(<5 x float> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm: -; GCN: v_mov_b32_e32 v0, 0{{$}} -; GCN: v_mov_b32_e32 v1, 0x40100000 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { +; VI-LABEL: test_call_external_void_func_f64_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0x40100000 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_f64_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 
s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 0 +; MESA-NEXT: v_mov_b32_e32 v1, 0x40100000 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_f64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_f64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_f64_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 
flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NEXT: v_mov_b32_e32 v1, 0x40100000 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_f64(double 4.0) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2f64_imm: -; GCN: v_mov_b32_e32 v0, 0{{$}} -; GCN: v_mov_b32_e32 v1, 2.0 -; GCN: v_mov_b32_e32 v2, 0{{$}} -; GCN: v_mov_b32_e32 v3, 0x40100000 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { +; VI-LABEL: test_call_external_void_func_v2f64_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40100000 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v2f64_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: 
s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 0 +; MESA-NEXT: v_mov_b32_e32 v1, 2.0 +; MESA-NEXT: v_mov_b32_e32 v2, 0 +; MESA-NEXT: v_mov_b32_e32 v3, 0x40100000 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2f64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2f64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: 
test_call_external_void_func_v2f64_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NEXT: v_mov_b32_e32 v1, 2.0 +; HSA-NEXT: v_mov_b32_e32 v2, 0 +; HSA-NEXT: v_mov_b32_e32 v3, 0x40100000 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v2f64(<2 x double> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3f64_imm: -; GCN-DAG: v_mov_b32_e32 v0, 0{{$}} -; GCN-DAG: v_mov_b32_e32 v1, 2.0 -; GCN-DAG: v_mov_b32_e32 v2, 0{{$}} -; GCN-DAG: v_mov_b32_e32 v3, 0x40100000 -; GCN-DAG: v_mov_b32_e32 v4, 0{{$}} -; GCN-DAG: v_mov_b32_e32 v5, 0x40200000 -; GCN-DAG: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { +; VI-LABEL: test_call_external_void_func_v3f64_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 2.0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: v_mov_b32_e32 v3, 0x40100000 +; VI-NEXT: v_mov_b32_e32 v4, 0 +; VI-NEXT: v_mov_b32_e32 v5, 0x40200000 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], 
s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v3f64_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 0 +; MESA-NEXT: v_mov_b32_e32 v1, 2.0 +; MESA-NEXT: v_mov_b32_e32 v2, 0 +; MESA-NEXT: v_mov_b32_e32 v3, 0x40100000 +; MESA-NEXT: v_mov_b32_e32 v4, 0 +; MESA-NEXT: v_mov_b32_e32 v5, 0x40200000 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3f64_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3f64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 
v0, 0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3f64_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NEXT: v_mov_b32_e32 v1, 2.0 +; HSA-NEXT: v_mov_b32_e32 v2, 0 +; HSA-NEXT: v_mov_b32_e32 v3, 0x40100000 +; HSA-NEXT: v_mov_b32_e32 v4, 0 +; HSA-NEXT: v_mov_b32_e32 v5, 0x40200000 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f64@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v3f64(<3 x double> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2i16: -; GFX9: buffer_load_dword v0 -; GFX9-NOT: v0 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { +; VI-LABEL: test_call_external_void_func_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: 
s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v2i16: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, 
external_void_func_v2i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2i16: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <2 x i16>, ptr addrspace(1) undef call void @external_void_func_v2i16(<2 x i16> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3i16: -; GFX9: buffer_load_dwordx2 v[0:1] -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { +; VI-LABEL: test_call_external_void_func_v3i16: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; 
VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v3i16: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; MESA-NEXT: v_mov_b32_e32 v0, v2 +; MESA-NEXT: v_mov_b32_e32 v2, v3 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; 
GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3i16: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <3 x i16>, ptr addrspace(1) undef call void @external_void_func_v3i16(<3 x i16> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3f16: -; GFX9: buffer_load_dwordx2 v[0:1] -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { +; VI-LABEL: test_call_external_void_func_v3f16: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 
+; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v3f16: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: buffer_load_dwordx2 v[1:2], off, s[0:3], 0 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: v_cvt_f32_f16_e32 v0, v1 +; MESA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; MESA-NEXT: v_cvt_f32_f16_e32 v2, v2 +; MESA-NEXT: v_cvt_f32_f16_e32 v1, v1 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; 
GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3f16: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <3 x half>, ptr addrspace(1) undef call void @external_void_func_v3f16(<3 x half> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3i16_imm: -; GFX9: v_mov_b32_e32 v0, 0x20001 -; GFX9: v_mov_b32_e32 v1, 3 -; GFX9: s_swappc_b64 define 
amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { +; VI-LABEL: test_call_external_void_func_v3i16_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x20001 +; VI-NEXT: v_mov_b32_e32 v1, 3 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v3i16_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 1 +; MESA-NEXT: v_mov_b32_e32 v1, 2 +; MESA-NEXT: v_mov_b32_e32 v2, 3 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3i16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 
s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX9-NEXT: v_mov_b32_e32 v1, 3 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3i16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3i16_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 +; HSA-NEXT: v_mov_b32_e32 v1, 3 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v3i16(<3 x i16> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3f16_imm: -; GFX9: v_mov_b32_e32 v0, 0x40003c00 -; GFX9: v_mov_b32_e32 v1, 0x4400 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { +; VI-LABEL: test_call_external_void_func_v3f16_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; 
VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; VI-NEXT: v_mov_b32_e32 v1, 0x4400 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v3f16_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 1.0 +; MESA-NEXT: v_mov_b32_e32 v1, 2.0 +; MESA-NEXT: v_mov_b32_e32 v2, 4.0 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3f16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: 
s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3f16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3f16_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; HSA-NEXT: v_mov_b32_e32 v1, 0x4400 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v3f16(<3 x half> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v4i16: -; GFX9: buffer_load_dwordx2 v[0:1] -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { +; VI-LABEL: test_call_external_void_func_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx2 v[0:1], 
off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v4i16: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; MESA-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; MESA-NEXT: v_mov_b32_e32 v2, v1 +; MESA-NEXT: v_mov_b32_e32 v1, v4 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; 
GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v4i16: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <4 x i16>, ptr addrspace(1) undef call void @external_void_func_v4i16(<4 x i16> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v4i16_imm: -; GFX9-DAG: v_mov_b32_e32 v0, 0x20001 -; GFX9-DAG: v_mov_b32_e32 v1, 0x40003 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { +; VI-LABEL: test_call_external_void_func_v4i16_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 
s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x20001 +; VI-NEXT: v_mov_b32_e32 v1, 0x40003 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v4i16_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 1 +; MESA-NEXT: v_mov_b32_e32 v1, 2 +; MESA-NEXT: v_mov_b32_e32 v2, 3 +; MESA-NEXT: v_mov_b32_e32 v3, 4 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v4i16_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX9-NEXT: s_mov_b32 
s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v4i16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v4i16_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 +; HSA-NEXT: v_mov_b32_e32 v1, 0x40003 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v4i16(<4 x i16> ) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2f16: -; GFX9: buffer_load_dword v0 -; GFX9-NOT: v0 -; GFX9: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { +; VI-LABEL: test_call_external_void_func_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: 
buffer_load_dword v0, off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v2f16: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: v_cvt_f32_f16_e32 v0, v1 +; MESA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; MESA-NEXT: v_cvt_f32_f16_e32 v1, v1 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 
0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f16@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2f16: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f16@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) undef call void @external_void_func_v2f16(<2 x half> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2i32: -; GCN: buffer_load_dwordx2 v[0:1] -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { +; VI-LABEL: test_call_external_void_func_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: 
s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v2i32: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 
+; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <2 x i32>, ptr addrspace(1) undef call void @external_void_func_v2i32(<2 x i32> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v2i32_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1 -; GCN-DAG: v_mov_b32_e32 v1, 2 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { +; VI-LABEL: test_call_external_void_func_v2i32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 
s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, 2 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v2i32_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 1 +; MESA-NEXT: v_mov_b32_e32 v1, 2 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v2i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, 
external_void_func_v2i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v2i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v2i32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1 +; HSA-NEXT: v_mov_b32_e32 v1, 2 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: {{.*}} - -; GCN-NOT: v3{{$}} -; GCN-DAG: v_mov_b32_e32 v0, 3 -; GCN-DAG: v_mov_b32_e32 v1, 4 -; GCN-DAG: v_mov_b32_e32 v2, 5 - -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { +; VI-LABEL: test_call_external_void_func_v3i32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +;
VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: v_mov_b32_e32 v1, 4 +; VI-NEXT: v_mov_b32_e32 v2, 5 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v3i32_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s5 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 3 +; MESA-NEXT: v_mov_b32_e32 v1, 4 +; MESA-NEXT: v_mov_b32_e32 v2, 5 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: v_mov_b32_e32 v2, 5 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: 
test_call_external_void_func_v3i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 +; GFX11-NEXT: v_mov_b32_e32 v2, 5 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3i32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 3 +; HSA-NEXT: v_mov_b32_e32 v1, 4 +; HSA-NEXT: v_mov_b32_e32 v2, 5 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_i32: -; GCN-DAG: v_mov_b32_e32 v0, 3 -; GCN-DAG: v_mov_b32_e32 v1, 4 -; GCN-DAG: v_mov_b32_e32 v2, 5 -; GCN-DAG: v_mov_b32_e32 v3, 6 define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { +; VI-LABEL: test_call_external_void_func_v3i32_i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: v_mov_b32_e32 v1, 4 +; VI-NEXT:
v_mov_b32_e32 v2, 5 +; VI-NEXT: v_mov_b32_e32 v3, 6 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v3i32_i32: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s5 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 3 +; MESA-NEXT: v_mov_b32_e32 v1, 4 +; MESA-NEXT: v_mov_b32_e32 v2, 5 +; MESA-NEXT: v_mov_b32_e32 v3, 6 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v3i32_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: v_mov_b32_e32 v2, 5 +; GFX9-NEXT: v_mov_b32_e32 v3, 6 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] 
+; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v3i32_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 +; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i32_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i32_i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v3i32_i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 3 +; HSA-NEXT: v_mov_b32_e32 v1, 4 +; HSA-NEXT: v_mov_b32_e32 v2, 5 +; HSA-NEXT: v_mov_b32_e32 v3, 6 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v4i32: -; GCN: buffer_load_dwordx4 v[0:3] -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { +; VI-LABEL: test_call_external_void_func_v4i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT:
s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v4i32: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; 
GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v4i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = load <4 x i32>, ptr addrspace(1) undef call void @external_void_func_v4i32(<4 x i32> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v4i32_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1 -; GCN-DAG: v_mov_b32_e32 v1, 2 -; GCN-DAG: v_mov_b32_e32 v2, 3 -; GCN-DAG: v_mov_b32_e32 v3, 4 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { +; VI-LABEL: test_call_external_void_func_v4i32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] 
+; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, 2 +; VI-NEXT: v_mov_b32_e32 v2, 3 +; VI-NEXT: v_mov_b32_e32 v3, 4 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v4i32_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 1 +; MESA-NEXT: v_mov_b32_e32 v1, 2 +; MESA-NEXT: v_mov_b32_e32 v2, 3 +; MESA-NEXT: v_mov_b32_e32 v3, 4 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v4i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_mov_b32_e32 v2, 3 +; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, 
external_void_func_v4i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v4i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v4i32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1 +; HSA-NEXT: v_mov_b32_e32 v1, 2 +; HSA-NEXT: v_mov_b32_e32 v2, 3 +; HSA-NEXT: v_mov_b32_e32 v3, 4 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v5i32_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1 -; GCN-DAG: v_mov_b32_e32 v1, 2 -; GCN-DAG: v_mov_b32_e32 v2, 3 -; GCN-DAG: v_mov_b32_e32 v3, 4 -; GCN-DAG: v_mov_b32_e32 v4, 5 -; GCN-NOT: v5, -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { +; VI-LABEL: test_call_external_void_func_v5i32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT:
s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, 2 +; VI-NEXT: v_mov_b32_e32 v2, 3 +; VI-NEXT: v_mov_b32_e32 v3, 4 +; VI-NEXT: v_mov_b32_e32 v4, 5 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v5i32_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 1 +; MESA-NEXT: v_mov_b32_e32 v1, 2 +; MESA-NEXT: v_mov_b32_e32 v2, 3 +; MESA-NEXT: v_mov_b32_e32 v3, 4 +; MESA-NEXT: v_mov_b32_e32 v4, 5 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v5i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: 
v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_mov_b32_e32 v2, 3 +; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: v_mov_b32_e32 v4, 5 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v5i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: v_mov_b32_e32 v4, 5 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v5i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v5i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v5i32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1 +; HSA-NEXT: v_mov_b32_e32 v1, 2 +; HSA-NEXT: v_mov_b32_e32 v2, 3 +; HSA-NEXT: v_mov_b32_e32 v3, 4 +; HSA-NEXT: v_mov_b32_e32 v4, 5 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v5i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v5i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v8i32: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { +; VI-LABEL: test_call_external_void_func_v8i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v8i32: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: s_waitcnt lgkmcnt(0) +; MESA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; MESA-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: 
test_call_external_void_func_v8i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v8i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v8i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32@rel32@hi+12 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v8i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; 
HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v8i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v8i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) undef %val = load <8 x i32>, ptr addrspace(1) %ptr call void @external_void_func_v8i32(<8 x i32> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v8i32_imm: -; GCN-DAG: v_mov_b32_e32 v0, 1 -; GCN-DAG: v_mov_b32_e32 v1, 2 -; GCN-DAG: v_mov_b32_e32 v2, 3 -; GCN-DAG: v_mov_b32_e32 v3, 4 -; GCN-DAG: v_mov_b32_e32 v4, 5 -; GCN-DAG: v_mov_b32_e32 v5, 6 -; GCN-DAG: v_mov_b32_e32 v6, 7 -; GCN-DAG: v_mov_b32_e32 v7, 8 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { +; VI-LABEL: test_call_external_void_func_v8i32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, 2 +; VI-NEXT: v_mov_b32_e32 v2, 3 +; VI-NEXT: v_mov_b32_e32 v3, 4 +; VI-NEXT: v_mov_b32_e32 v4, 5 +; VI-NEXT: v_mov_b32_e32 v5, 6 +; VI-NEXT: v_mov_b32_e32 v6, 7 +; VI-NEXT: v_mov_b32_e32 v7, 8 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; 
MESA-LABEL: test_call_external_void_func_v8i32_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: v_mov_b32_e32 v0, 1 +; MESA-NEXT: v_mov_b32_e32 v1, 2 +; MESA-NEXT: v_mov_b32_e32 v2, 3 +; MESA-NEXT: v_mov_b32_e32 v3, 4 +; MESA-NEXT: v_mov_b32_e32 v4, 5 +; MESA-NEXT: v_mov_b32_e32 v5, 6 +; MESA-NEXT: v_mov_b32_e32 v6, 7 +; MESA-NEXT: v_mov_b32_e32 v7, 8 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v8i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_mov_b32_e32 v2, 3 +; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: v_mov_b32_e32 v4, 5 +; GFX9-NEXT: v_mov_b32_e32 v5, 6 +; GFX9-NEXT: v_mov_b32_e32 v6, 7 +; GFX9-NEXT: v_mov_b32_e32 v7, 8 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: 
test_call_external_void_func_v8i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6 +; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v8i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v8i32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1 +; HSA-NEXT: v_mov_b32_e32 v1, 2 +; HSA-NEXT: v_mov_b32_e32 v2, 3 +; HSA-NEXT: v_mov_b32_e32 v3, 4 +; HSA-NEXT: v_mov_b32_e32 v4, 5 +; HSA-NEXT: v_mov_b32_e32 v5, 6 +; HSA-NEXT: v_mov_b32_e32 v6, 7 +; HSA-NEXT: v_mov_b32_e32 v7, 8 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v8i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v8i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm call void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v16i32: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN-NOT: s_waitcnt -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { +; VI-LABEL: test_call_external_void_func_v16i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36,
SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v16i32: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: s_waitcnt lgkmcnt(0) +; MESA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; MESA-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; MESA-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; MESA-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, 
external_void_func_v16i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v16i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v16i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v16i32@rel32@hi+12 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: 
s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v16i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v16i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v16i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) undef %val = load <16 x i32>, ptr addrspace(1) %ptr call void @external_void_func_v16i32(<16 x i32> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v32i32: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN-DAG: buffer_load_dwordx4 v[16:19], off -; GCN-DAG: buffer_load_dwordx4 v[20:23], off -; GCN-DAG: buffer_load_dwordx4 v[24:27], off -; GCN-DAG: buffer_load_dwordx4 v[28:31], off -; GCN: buffer_store_dword v31, off, s{{\[[0-9]+:[0-9]+\]}}, s32 -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { +; VI-LABEL: test_call_external_void_func_v32i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; 
VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_getpc_b64 s[8:9] +; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; VI-NEXT: s_swappc_b64 s[30:31], s[8:9] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v32i32: +; MESA: ; %bb.0: +; MESA-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; MESA-NEXT: s_mov_b32 s7, 0xf000 +; MESA-NEXT: s_mov_b32 s6, -1 +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_waitcnt lgkmcnt(0) +; MESA-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; MESA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; MESA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; MESA-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; MESA-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; MESA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; MESA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; MESA-NEXT: 
buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_getpc_b64 s[8:9] +; MESA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; MESA-NEXT: s_waitcnt vmcnt(7) +; MESA-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; MESA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, 
external_void_func_v32i32@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v32i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v32i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v32i32@rel32@hi+12 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48 +; GFX11-NEXT: buffer_load_b128 v[16:19], off, s[4:7], 0 offset:64 +; GFX11-NEXT: buffer_load_b128 v[20:23], off, s[4:7], 0 offset:80 +; GFX11-NEXT: buffer_load_b128 v[24:27], off, s[4:7], 0 offset:96 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: scratch_store_b32 off, v31, s32 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v32i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; HSA-NEXT: 
buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 +; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 +; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_getpc_b64 s[12:13] +; HSA-NEXT: s_add_u32 s12, s12, external_void_func_v32i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s13, s13, external_void_func_v32i32@rel32@hi+12 +; HSA-NEXT: s_waitcnt vmcnt(7) +; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; HSA-NEXT: s_swappc_b64 s[30:31], s[12:13] +; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) undef %val = load <32 x i32>, ptr addrspace(1) %ptr call void @external_void_func_v32i32(<32 x i32> %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v32i32_i32: -; HSA-NOT: s_add_u32 s32 - -; MESA-NOT: s_add_u32 s32 - -; GCN-DAG: buffer_load_dword [[VAL1:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN-DAG: buffer_load_dwordx4 v[16:19], off -; GCN-DAG: buffer_load_dwordx4 v[20:23], off -; GCN-DAG: buffer_load_dwordx4 v[24:27], off -; GCN-DAG: buffer_load_dwordx4 v[28:31], off - -; GCN: s_waitcnt -; GCN-DAG: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}} -; GCN-DAG: buffer_store_dword v31, off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}} -; GCN: s_swappc_b64 -; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { +; VI-LABEL: test_call_external_void_func_v32i32_i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 
s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[4:7], 0 +; VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v32i32_i32: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s5 +; MESA-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; MESA-NEXT: s_mov_b32 s7, 0xf000 +; MESA-NEXT: s_mov_b32 s6, -1 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_waitcnt lgkmcnt(0) +; MESA-NEXT: buffer_load_dword v32, off, s[4:7], 0 +; MESA-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; MESA-NEXT: 
buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; MESA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; MESA-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; MESA-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; MESA-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; MESA-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; MESA-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; MESA-NEXT: s_waitcnt vmcnt(8) +; MESA-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 +; MESA-NEXT: s_waitcnt vmcnt(8) +; MESA-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v32i32_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; GFX9-NEXT: buffer_load_dwordx4 
v[20:23], off, s[4:7], 0 offset:80 +; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v32i32_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v32i32_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v32i32_i32@rel32@hi+12 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112 +; GFX11-NEXT: buffer_load_b32 v32, off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48 +; GFX11-NEXT: buffer_load_b128 v[16:19], off, s[4:7], 0 offset:64 +; GFX11-NEXT: buffer_load_b128 v[20:23], off, s[4:7], 0 offset:80 +; GFX11-NEXT: buffer_load_b128 v[24:27], off, s[4:7], 0 offset:96 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_add_i32 s4, s32, 4 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: scratch_store_b32 off, v31, s32 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: scratch_store_b32 off, v32, s4 +; GFX11-NEXT: 
s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v32i32_i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: buffer_load_dword v32, off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 +; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 +; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32_i32@rel32@hi+12 +; HSA-NEXT: s_waitcnt vmcnt(8) +; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; HSA-NEXT: s_waitcnt vmcnt(8) +; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef %val0 = load <32 x i32>, ptr addrspace(1) %ptr0 %val1 = load i32, ptr addrspace(1) undef @@ -623,54 +4593,366 @@ ret void } -; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm: -; GCN: v_mov_b32_e32 v0, 42 -; GCN: s_swappc_b64 s[30:31], -; GCN-NOT: s_waitcnt -; GCN: buffer_store_dword v0, off, s[36:39], 0 define amdgpu_kernel void 
@test_call_external_i32_func_i32_imm(ptr addrspace(1) %out) #0 { +; VI-LABEL: test_call_external_i32_func_i32_imm: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s42, -1 +; VI-NEXT: s_mov_b32 s43, 0xe80000 +; VI-NEXT: s_add_u32 s40, s40, s5 +; VI-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 +; VI-NEXT: s_addc_u32 s41, s41, 0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[40:41] +; VI-NEXT: s_mov_b64 s[2:3], s[42:43] +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_mov_b32 s39, 0xf000 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_i32_func_i32_imm: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s42, -1 +; MESA-NEXT: s_mov_b32 s43, 0xe8f000 +; MESA-NEXT: s_add_u32 s40, s40, s5 +; MESA-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; MESA-NEXT: s_addc_u32 s41, s41, 0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[40:41] +; MESA-NEXT: s_mov_b64 s[2:3], s[42:43] +; MESA-NEXT: v_mov_b32_e32 v0, 42 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_mov_b32 s39, 0xf000 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: buffer_store_dword v0, off, s[36:39], 0 +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_i32_func_i32_imm: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, 
SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: s_add_u32 s40, s40, s5 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_mov_b32 s39, 0xf000 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_i32_func_i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[36:37], s[2:3], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_mov_b32 s39, 0x31016000 +; GFX11-NEXT: s_mov_b32 s38, -1 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_i32_func_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_i32_func_i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: buffer_store_b32 v0, off, s[36:39], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_i32_func_i32_imm: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_load_dwordx2 s[36:37], s[6:7], 0x0 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 42 +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 
+; HSA-NEXT: s_mov_b32 s39, 0x1100f000 +; HSA-NEXT: s_mov_b32 s38, -1 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_i32_func_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_i32_func_i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: buffer_store_dword v0, off, s[36:39], 0 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_endpgm %val = call i32 @external_i32_func_i32(i32 42) store volatile i32 %val, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32: -; GCN: buffer_load_ubyte v0, off -; GCN: buffer_load_dword v1, off -; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { +; VI-LABEL: test_call_external_void_func_struct_i8_i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_struct_i8_i32: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; 
MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: s_waitcnt lgkmcnt(0) +; MESA-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; MESA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_struct_i8_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_struct_i8_i32@rel32@lo+4 +; GFX11-NEXT: 
s_addc_u32 s3, s3, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[4:7], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[4:7], 0 offset:4 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_struct_i8_i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; HSA-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_struct_i8_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_struct_i8_i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef %val = load { i8, i32 }, ptr addrspace(1) %ptr0 call void @external_void_func_struct_i8_i32({ i8, i32 } %val) ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_byval_struct_i8_i32: -; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3 -; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8 -; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], 0 offset:8 -; MESA-DAG: buffer_store_dword [[VAL1]], off, s[36:39], 0 offset:12 - -; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], 0 offset:8 -; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], 0 offset:12 - -; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], 0 offset:12 -; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], 0 offset:8 - -; MESA: buffer_load_dword 
[[RELOAD_VAL1:v[0-9]+]], off, s[36:39], 0 offset:12 -; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], 0 offset:8 - -; GCN-DAG: s_movk_i32 [[SP:s[0-9]+]], 0x400{{$}} - -; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}} -; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4 - -; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]]{{$}} -; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:4 - -; GCN-NEXT: s_swappc_b64 -; GCN-NOT: [[SP]] define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 { +; VI-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8 +; VI-NEXT: v_mov_b32_e32 v0, 8 +; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12 +; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12 +; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_movk_i32 s32, 0x400 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, 
SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: v_mov_b32_e32 v0, 3 +; MESA-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8 +; MESA-NEXT: v_mov_b32_e32 v0, 8 +; MESA-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12 +; MESA-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12 +; MESA-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_movk_i32 s32, 0x400 +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; MESA-NEXT: s_waitcnt vmcnt(1) +; MESA-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; MESA-NEXT: s_waitcnt vmcnt(1) +; MESA-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12 +; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_movk_i32 s32, 0x400 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, 
external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 +; GFX11-NEXT: s_mov_b32 s32, 16 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b8 off, v0, off offset:8 +; GFX11-NEXT: scratch_store_b32 off, v1, off offset:12 +; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: v_mov_b32_e32 v0, 3 +; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8 +; HSA-NEXT: v_mov_b32_e32 v0, 8 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 +; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 +; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 +; HSA-NEXT: s_movk_i32 s32, 0x400 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; HSA-NEXT: 
s_addc_u32 s9, s9, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %val = alloca { i8, i32 }, align 8, addrspace(5) %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 0 %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 1 @@ -680,28 +4962,186 @@ ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: -; GCN-DAG: s_movk_i32 [[SP:s[0-9]+]], 0x800{{$}} - -; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3 -; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8 -; GCN-DAG: buffer_store_byte [[VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; GCN-DAG: buffer_store_dword [[VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 - -; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 - -; GCN-NOT: s_add_u32 [[SP]] -; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]]{{$}} -; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4 -; GCN: s_swappc_b64 -; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 -; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:20 -; GCN-NOT: s_sub_u32 [[SP]] - -; GCN: buffer_store_byte [[LOAD_OUT_VAL0]], off -; GCN: buffer_store_dword [[LOAD_OUT_VAL1]], off define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 { +; VI-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s37, 
SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s5 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: v_mov_b32_e32 v0, 3 +; VI-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8 +; VI-NEXT: v_mov_b32_e32 v0, 8 +; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12 +; VI-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12 +; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 +; VI-NEXT: s_movk_i32 s32, 0x800 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; VI-NEXT: v_mov_b32_e32 v0, 16 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:16 +; VI-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:20 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; MESA: ; %bb.0: +; MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s5 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: v_mov_b32_e32 v0, 3 +; MESA-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8 +; MESA-NEXT: v_mov_b32_e32 v0, 8 +; MESA-NEXT: 
buffer_store_dword v0, off, s[36:39], 0 offset:12 +; MESA-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12 +; MESA-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 +; MESA-NEXT: s_movk_i32 s32, 0x800 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; MESA-NEXT: s_waitcnt vmcnt(1) +; MESA-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; MESA-NEXT: s_waitcnt vmcnt(1) +; MESA-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; MESA-NEXT: v_mov_b32_e32 v0, 16 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:16 +; MESA-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:20 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: s_waitcnt vmcnt(1) +; MESA-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s5 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 3 +; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:12 +; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:12 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:8 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_mov_b64 
s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v1, off, s[36:39], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 16 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:16 +; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:20 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 +; GFX11-NEXT: s_mov_b32 s32, 32 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b8 off, v0, off offset:8 +; GFX11-NEXT: scratch_store_b32 off, v1, off offset:12 +; GFX11-NEXT: scratch_load_b64 v[0:1], off, off offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32 +; GFX11-NEXT: v_mov_b32_e32 v0, 16 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_u8 v0, off, off offset:16 +; GFX11-NEXT: scratch_load_b32 v1, off, off offset:20 +; 
GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: v_mov_b32_e32 v0, 3 +; HSA-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:8 +; HSA-NEXT: v_mov_b32_e32 v0, 8 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 +; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12 +; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 +; HSA-NEXT: s_movk_i32 s32, 0x800 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; HSA-NEXT: v_mov_b32_e32 v0, 16 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:16 +; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:20 +; HSA-NEXT: s_mov_b32 s7, 0x1100f000 +; HSA-NEXT: s_mov_b32 s6, -1 +; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_endpgm %in.val = alloca { i8, 
i32 }, align 8, addrspace(5) %out.val = alloca { i8, i32 }, align 8, addrspace(5) %in.gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %in.val, i32 0, i32 0 @@ -719,74 +5159,1000 @@ ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v16i8: define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { +; VI-LABEL: test_call_external_void_func_v16i8: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s38, -1 +; VI-NEXT: s_mov_b32 s39, 0xe80000 +; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_addc_u32 s37, s37, 0 +; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mov_b32_e32 v12, v3 +; VI-NEXT: v_mov_b32_e32 v1, v16 +; VI-NEXT: v_mov_b32_e32 v2, v17 +; VI-NEXT: v_mov_b32_e32 v3, v18 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: test_call_external_void_func_v16i8: +; MESA: ; %bb.0: +; 
MESA-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; MESA-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s38, -1 +; MESA-NEXT: s_mov_b32 s39, 0xe8f000 +; MESA-NEXT: s_add_u32 s36, s36, s3 +; MESA-NEXT: s_mov_b32 s3, 0xf000 +; MESA-NEXT: s_mov_b32 s2, -1 +; MESA-NEXT: s_waitcnt lgkmcnt(0) +; MESA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; MESA-NEXT: s_addc_u32 s37, s37, 0 +; MESA-NEXT: s_mov_b64 s[0:1], s[36:37] +; MESA-NEXT: s_mov_b64 s[2:3], s[38:39] +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; MESA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; MESA-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; MESA-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; MESA-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; MESA-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; MESA-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; MESA-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; MESA-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; MESA-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; MESA-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; MESA-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; MESA-NEXT: v_mov_b32_e32 v4, v1 +; MESA-NEXT: v_mov_b32_e32 v8, v2 +; MESA-NEXT: v_mov_b32_e32 v12, v3 +; MESA-NEXT: v_mov_b32_e32 v1, v16 +; MESA-NEXT: v_mov_b32_e32 v2, v17 +; MESA-NEXT: v_mov_b32_e32 v3, v18 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: test_call_external_void_func_v16i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_mov_b32 s3, 
0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_mov_b32_e32 v12, v3 +; GFX9-NEXT: v_mov_b32_e32 v1, v16 +; GFX9-NEXT: v_mov_b32_e32 v2, v17 +; GFX9-NEXT: v_mov_b32_e32 v3, v18 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: test_call_external_void_func_v16i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v16i8@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v16i8@rel32@hi+12 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v16 +; GFX11-NEXT: v_mov_b32_e32 v8, v2 +; GFX11-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18 +; GFX11-NEXT: v_mov_b32_e32 v2, v17 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: test_call_external_void_func_v16i8: +; HSA: ; %bb.0: +; HSA-NEXT: s_add_i32 s6, s6, s9 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 +; HSA-NEXT: s_add_u32 s0, s0, s9 +; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; HSA-NEXT: s_mov_b32 s11, 0x1100f000 +; HSA-NEXT: s_mov_b32 s10, -1 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v16i8@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v16i8@rel32@hi+12 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; HSA-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; HSA-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; HSA-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; HSA-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; HSA-NEXT: v_mov_b32_e32 v4, v1 +; HSA-NEXT: v_mov_b32_e32 v8, v2 +; HSA-NEXT: v_mov_b32_e32 v12, 
v3 +; HSA-NEXT: v_mov_b32_e32 v1, v16 +; HSA-NEXT: v_mov_b32_e32 v2, v17 +; HSA-NEXT: v_mov_b32_e32 v3, v18 +; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] +; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) undef %val = load <16 x i8>, ptr addrspace(1) %ptr call void @external_void_func_v16i8(<16 x i8> %val) ret void } -; GCN-LABEL: {{^}}stack_passed_arg_alignment_v32i32_f64: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 -; GCN: s_swappc_b64 define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { +; VI-LABEL: stack_passed_arg_alignment_v32i32_f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s54, -1 +; VI-NEXT: s_mov_b32 s55, 0xe80000 +; VI-NEXT: s_add_u32 s52, s52, s5 +; VI-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64 +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa4 +; VI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_addc_u32 s53, s53, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s23 +; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], s[52:53] +; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 +; VI-NEXT: s_mov_b64 s[2:3], s[54:55] +; VI-NEXT: v_mov_b32_e32 v0, s36 +; VI-NEXT: v_mov_b32_e32 v1, s37 +; VI-NEXT: v_mov_b32_e32 v2, s38 +; VI-NEXT: v_mov_b32_e32 v3, s39 +; VI-NEXT: v_mov_b32_e32 v4, s40 +; VI-NEXT: v_mov_b32_e32 v5, s41 +; VI-NEXT: v_mov_b32_e32 v6, s42 +; VI-NEXT: v_mov_b32_e32 v7, s43 +; VI-NEXT: v_mov_b32_e32 v8, s44 +; VI-NEXT: v_mov_b32_e32 v9, s45 +; VI-NEXT: v_mov_b32_e32 v10, s46 +; VI-NEXT: v_mov_b32_e32 v11, s47 +; 
VI-NEXT: v_mov_b32_e32 v12, s48 +; VI-NEXT: v_mov_b32_e32 v13, s49 +; VI-NEXT: v_mov_b32_e32 v14, s50 +; VI-NEXT: v_mov_b32_e32 v15, s51 +; VI-NEXT: v_mov_b32_e32 v16, s8 +; VI-NEXT: v_mov_b32_e32 v17, s9 +; VI-NEXT: v_mov_b32_e32 v18, s10 +; VI-NEXT: v_mov_b32_e32 v19, s11 +; VI-NEXT: v_mov_b32_e32 v20, s12 +; VI-NEXT: v_mov_b32_e32 v21, s13 +; VI-NEXT: v_mov_b32_e32 v22, s14 +; VI-NEXT: v_mov_b32_e32 v23, s15 +; VI-NEXT: v_mov_b32_e32 v24, s16 +; VI-NEXT: v_mov_b32_e32 v25, s17 +; VI-NEXT: v_mov_b32_e32 v26, s18 +; VI-NEXT: v_mov_b32_e32 v27, s19 +; VI-NEXT: v_mov_b32_e32 v28, s20 +; VI-NEXT: v_mov_b32_e32 v29, s21 +; VI-NEXT: v_mov_b32_e32 v30, s22 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: s_endpgm +; +; MESA-LABEL: stack_passed_arg_alignment_v32i32_f64: +; MESA: ; %bb.0: ; %entry +; MESA-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; MESA-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; MESA-NEXT: s_mov_b32 s54, -1 +; MESA-NEXT: s_mov_b32 s55, 0xe8f000 +; MESA-NEXT: s_add_u32 s52, s52, s5 +; MESA-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x19 +; MESA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x29 +; MESA-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x9 +; MESA-NEXT: s_mov_b32 s32, 0 +; MESA-NEXT: s_addc_u32 s53, s53, 0 +; MESA-NEXT: s_waitcnt lgkmcnt(0) +; MESA-NEXT: v_mov_b32_e32 v0, s23 +; MESA-NEXT: buffer_store_dword v0, off, s[52:55], s32 +; MESA-NEXT: v_mov_b32_e32 v0, s4 +; MESA-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; MESA-NEXT: v_mov_b32_e32 v0, s5 +; MESA-NEXT: s_mov_b64 s[6:7], s[0:1] +; MESA-NEXT: s_mov_b64 s[0:1], s[52:53] +; MESA-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 +; MESA-NEXT: s_mov_b64 s[2:3], s[54:55] +; MESA-NEXT: v_mov_b32_e32 v0, s36 +; MESA-NEXT: v_mov_b32_e32 v1, s37 +; MESA-NEXT: v_mov_b32_e32 v2, s38 +; MESA-NEXT: v_mov_b32_e32 v3, s39 +; MESA-NEXT: 
v_mov_b32_e32 v4, s40 +; MESA-NEXT: v_mov_b32_e32 v5, s41 +; MESA-NEXT: v_mov_b32_e32 v6, s42 +; MESA-NEXT: v_mov_b32_e32 v7, s43 +; MESA-NEXT: v_mov_b32_e32 v8, s44 +; MESA-NEXT: v_mov_b32_e32 v9, s45 +; MESA-NEXT: v_mov_b32_e32 v10, s46 +; MESA-NEXT: v_mov_b32_e32 v11, s47 +; MESA-NEXT: v_mov_b32_e32 v12, s48 +; MESA-NEXT: v_mov_b32_e32 v13, s49 +; MESA-NEXT: v_mov_b32_e32 v14, s50 +; MESA-NEXT: v_mov_b32_e32 v15, s51 +; MESA-NEXT: v_mov_b32_e32 v16, s8 +; MESA-NEXT: v_mov_b32_e32 v17, s9 +; MESA-NEXT: v_mov_b32_e32 v18, s10 +; MESA-NEXT: v_mov_b32_e32 v19, s11 +; MESA-NEXT: v_mov_b32_e32 v20, s12 +; MESA-NEXT: v_mov_b32_e32 v21, s13 +; MESA-NEXT: v_mov_b32_e32 v22, s14 +; MESA-NEXT: v_mov_b32_e32 v23, s15 +; MESA-NEXT: v_mov_b32_e32 v24, s16 +; MESA-NEXT: v_mov_b32_e32 v25, s17 +; MESA-NEXT: v_mov_b32_e32 v26, s18 +; MESA-NEXT: v_mov_b32_e32 v27, s19 +; MESA-NEXT: v_mov_b32_e32 v28, s20 +; MESA-NEXT: v_mov_b32_e32 v29, s21 +; MESA-NEXT: v_mov_b32_e32 v30, s22 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: s_endpgm +; +; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s54, -1 +; GFX9-NEXT: s_mov_b32 s55, 0xe00000 +; GFX9-NEXT: s_add_u32 s52, s52, s5 +; GFX9-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa4 +; GFX9-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_addc_u32 s53, s53, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s23 +; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: 
s_mov_b64 s[6:7], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 +; GFX9-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX9-NEXT: v_mov_b32_e32 v0, s36 +; GFX9-NEXT: v_mov_b32_e32 v1, s37 +; GFX9-NEXT: v_mov_b32_e32 v2, s38 +; GFX9-NEXT: v_mov_b32_e32 v3, s39 +; GFX9-NEXT: v_mov_b32_e32 v4, s40 +; GFX9-NEXT: v_mov_b32_e32 v5, s41 +; GFX9-NEXT: v_mov_b32_e32 v6, s42 +; GFX9-NEXT: v_mov_b32_e32 v7, s43 +; GFX9-NEXT: v_mov_b32_e32 v8, s44 +; GFX9-NEXT: v_mov_b32_e32 v9, s45 +; GFX9-NEXT: v_mov_b32_e32 v10, s46 +; GFX9-NEXT: v_mov_b32_e32 v11, s47 +; GFX9-NEXT: v_mov_b32_e32 v12, s48 +; GFX9-NEXT: v_mov_b32_e32 v13, s49 +; GFX9-NEXT: v_mov_b32_e32 v14, s50 +; GFX9-NEXT: v_mov_b32_e32 v15, s51 +; GFX9-NEXT: v_mov_b32_e32 v16, s8 +; GFX9-NEXT: v_mov_b32_e32 v17, s9 +; GFX9-NEXT: v_mov_b32_e32 v18, s10 +; GFX9-NEXT: v_mov_b32_e32 v19, s11 +; GFX9-NEXT: v_mov_b32_e32 v20, s12 +; GFX9-NEXT: v_mov_b32_e32 v21, s13 +; GFX9-NEXT: v_mov_b32_e32 v22, s14 +; GFX9-NEXT: v_mov_b32_e32 v23, s15 +; GFX9-NEXT: v_mov_b32_e32 v24, s16 +; GFX9-NEXT: v_mov_b32_e32 v25, s17 +; GFX9-NEXT: v_mov_b32_e32 v26, s18 +; GFX9-NEXT: v_mov_b32_e32 v27, s19 +; GFX9-NEXT: v_mov_b32_e32 v28, s20 +; GFX9-NEXT: v_mov_b32_e32 v29, s21 +; GFX9-NEXT: v_mov_b32_e32 v30, s22 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: stack_passed_arg_alignment_v32i32_f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[20:21], s[2:3], 0xa4 +; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x64 +; GFX11-NEXT: s_load_b512 s[36:51], s[2:3], 0x24 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s22, s32, 8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s21 :: 
v_dual_mov_b32 v1, s20 +; GFX11-NEXT: v_mov_b32_e32 v2, s19 +; GFX11-NEXT: s_add_i32 s19, s32, 4 +; GFX11-NEXT: v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s43 +; GFX11-NEXT: scratch_store_b32 off, v0, s22 +; GFX11-NEXT: scratch_store_b32 off, v1, s19 +; GFX11-NEXT: scratch_store_b32 off, v2, s32 +; GFX11-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s39 +; GFX11-NEXT: v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v2, s38 +; GFX11-NEXT: v_dual_mov_b32 v5, s41 :: v_dual_mov_b32 v6, s42 +; GFX11-NEXT: v_dual_mov_b32 v9, s45 :: v_dual_mov_b32 v8, s44 +; GFX11-NEXT: v_dual_mov_b32 v11, s47 :: v_dual_mov_b32 v10, s46 +; GFX11-NEXT: v_dual_mov_b32 v13, s49 :: v_dual_mov_b32 v12, s48 +; GFX11-NEXT: v_dual_mov_b32 v15, s51 :: v_dual_mov_b32 v14, s50 +; GFX11-NEXT: v_dual_mov_b32 v17, s5 :: v_dual_mov_b32 v16, s4 +; GFX11-NEXT: v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v18, s6 +; GFX11-NEXT: v_dual_mov_b32 v21, s9 :: v_dual_mov_b32 v20, s8 +; GFX11-NEXT: v_dual_mov_b32 v23, s11 :: v_dual_mov_b32 v22, s10 +; GFX11-NEXT: v_dual_mov_b32 v25, s13 :: v_dual_mov_b32 v24, s12 +; GFX11-NEXT: v_dual_mov_b32 v27, s15 :: v_dual_mov_b32 v26, s14 +; GFX11-NEXT: v_dual_mov_b32 v29, s17 :: v_dual_mov_b32 v28, s16 +; GFX11-NEXT: v_mov_b32_e32 v30, s18 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, stack_passed_f64_arg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, stack_passed_f64_arg@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: s_endpgm +; +; HSA-LABEL: stack_passed_arg_alignment_v32i32_f64: +; HSA: ; %bb.0: ; %entry +; HSA-NEXT: s_add_i32 s8, s8, s11 +; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_add_u32 s0, s0, s11 +; HSA-NEXT: s_load_dwordx16 s[8:23], s[6:7], 0x40 +; HSA-NEXT: s_load_dwordx2 s[24:25], s[6:7], 0x80 +; HSA-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 +; HSA-NEXT: s_mov_b32 s32, 0 +; 
HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_waitcnt lgkmcnt(0) +; HSA-NEXT: v_mov_b32_e32 v0, s23 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; HSA-NEXT: v_mov_b32_e32 v0, s24 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: v_mov_b32_e32 v0, s25 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, s36 +; HSA-NEXT: v_mov_b32_e32 v1, s37 +; HSA-NEXT: v_mov_b32_e32 v2, s38 +; HSA-NEXT: v_mov_b32_e32 v3, s39 +; HSA-NEXT: v_mov_b32_e32 v4, s40 +; HSA-NEXT: v_mov_b32_e32 v5, s41 +; HSA-NEXT: v_mov_b32_e32 v6, s42 +; HSA-NEXT: v_mov_b32_e32 v7, s43 +; HSA-NEXT: v_mov_b32_e32 v8, s44 +; HSA-NEXT: v_mov_b32_e32 v9, s45 +; HSA-NEXT: v_mov_b32_e32 v10, s46 +; HSA-NEXT: v_mov_b32_e32 v11, s47 +; HSA-NEXT: v_mov_b32_e32 v12, s48 +; HSA-NEXT: v_mov_b32_e32 v13, s49 +; HSA-NEXT: v_mov_b32_e32 v14, s50 +; HSA-NEXT: v_mov_b32_e32 v15, s51 +; HSA-NEXT: v_mov_b32_e32 v16, s8 +; HSA-NEXT: v_mov_b32_e32 v17, s9 +; HSA-NEXT: v_mov_b32_e32 v18, s10 +; HSA-NEXT: v_mov_b32_e32 v19, s11 +; HSA-NEXT: v_mov_b32_e32 v20, s12 +; HSA-NEXT: v_mov_b32_e32 v21, s13 +; HSA-NEXT: v_mov_b32_e32 v22, s14 +; HSA-NEXT: v_mov_b32_e32 v23, s15 +; HSA-NEXT: v_mov_b32_e32 v24, s16 +; HSA-NEXT: v_mov_b32_e32 v25, s17 +; HSA-NEXT: v_mov_b32_e32 v26, s18 +; HSA-NEXT: v_mov_b32_e32 v27, s19 +; HSA-NEXT: v_mov_b32_e32 v28, s20 +; HSA-NEXT: v_mov_b32_e32 v29, s21 +; HSA-NEXT: v_mov_b32_e32 v30, s22 +; HSA-NEXT: s_getpc_b64 s[24:25] +; HSA-NEXT: s_add_u32 s24, s24, stack_passed_f64_arg@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s25, s25, stack_passed_f64_arg@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[24:25] +; HSA-NEXT: s_endpgm entry: call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp) ret void } -; GCN-LABEL: {{^}}tail_call_byval_align16: -; GCN-NOT: s32 -; GCN: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:28 -; GCN: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], 
s32{{$}} - -; GCN: s_getpc_b64 - -; GCN: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:20 -; GCN: buffer_load_dword [[VREG3:v[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN: buffer_store_dword [[VREG1]], off, s[0:3], s32{{$}} -; GCN: buffer_store_dword [[VREG3]], off, s[0:3], s32 offset:16{{$}} -; GCN-NOT: s32 -; GCN: s_setpc_b64 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { +; VI-LABEL: tail_call_byval_align16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; VI-NEXT: s_setpc_b64 s[4:5] +; +; MESA-LABEL: tail_call_byval_align16: +; MESA: ; %bb.0: ; %entry +; MESA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MESA-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; MESA-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; MESA-NEXT: s_waitcnt vmcnt(1) +; MESA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 +; MESA-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; MESA-NEXT: s_waitcnt vmcnt(2) +; MESA-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; MESA-NEXT: s_waitcnt vmcnt(1) +; MESA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; MESA-NEXT: s_setpc_b64 s[4:5] +; +; GFX9-LABEL: tail_call_byval_align16: +; 
GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX11-LABEL: tail_call_byval_align16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b32 off, v31, s32 +; GFX11-NEXT: scratch_load_b64 v[31:32], off, s32 offset:24 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b64 off, v[31:32], s32 offset:16 +; GFX11-NEXT: s_setpc_b64 s[0:1] +; +; HSA-LABEL: tail_call_byval_align16: +; HSA: ; %bb.0: ; %entry +; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; HSA-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 +; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20 +; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; HSA-NEXT: s_waitcnt vmcnt(2) +; HSA-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 +; HSA-NEXT: s_waitcnt vmcnt(1) +; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16 +; HSA-NEXT: s_setpc_b64 s[4:5] entry: %alloca = alloca double, align 8, addrspace(5) tail call void @byval_align16_f64_arg(<32 x i32> %val, ptr addrspace(5) byval(double) align 16 %alloca) ret void } -; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64: -; GCN-NOT: s32 -; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_load_dword v31, off, s[0:3], s32{{$}} -; GCN: s_getpc_b64 -; GCN: buffer_store_dword v31, off, s[0:3], s32{{$}} -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 -; GCN-NOT: s32 -; GCN: s_setpc_b64 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { +; VI-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: s_setpc_b64 s[4:5] +; +; MESA-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64: +; MESA: ; %bb.0: ; %entry +; MESA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MESA-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; MESA-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; MESA-NEXT: buffer_load_dword v33, 
off, s[0:3], s32 offset:8 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; MESA-NEXT: s_waitcnt vmcnt(2) +; MESA-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; MESA-NEXT: s_waitcnt vmcnt(2) +; MESA-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; MESA-NEXT: s_waitcnt vmcnt(2) +; MESA-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; MESA-NEXT: s_setpc_b64 s[4:5] +; +; GFX9-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_setpc_b64 s[4:5] +; +; GFX11-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 +; GFX11-NEXT: scratch_load_b64 v[31:32], off, s32 offset:4 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: scratch_store_b32 off, v33, s32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b64 off, v[31:32], s32 offset:4 +; GFX11-NEXT: 
s_setpc_b64 s[0:1] +; +; HSA-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64: +; HSA: ; %bb.0: ; %entry +; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HSA-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; HSA-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; HSA-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 +; HSA-NEXT: s_waitcnt vmcnt(2) +; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 +; HSA-NEXT: s_waitcnt vmcnt(2) +; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 +; HSA-NEXT: s_waitcnt vmcnt(2) +; HSA-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:8 +; HSA-NEXT: s_setpc_b64 s[4:5] entry: tail call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp) ret void } -; GCN-LABEL: {{^}}stack_12xv3i32: -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN: buffer_store_dword [[REG11]], off, s[0:3], s32{{$}} -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: buffer_store_dword [[REG12]], {{.*$}} -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16 -; GCN: s_getpc define void @stack_12xv3i32() #0 { +; VI-LABEL: stack_12xv3i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s33 +; VI-NEXT: s_mov_b32 s33, s32 +; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[8:9] +; VI-NEXT: s_addk_i32 s32, 0x400 +; VI-NEXT: v_mov_b32_e32 v0, 11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; 
VI-NEXT: v_mov_b32_e32 v0, 12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v0, 13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; VI-NEXT: v_mov_b32_e32 v0, 14 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; VI-NEXT: v_mov_b32_e32 v0, 15 +; VI-NEXT: v_writelane_b32 v40, s30, 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: v_mov_b32_e32 v3, 1 +; VI-NEXT: v_mov_b32_e32 v4, 1 +; VI-NEXT: v_mov_b32_e32 v5, 1 +; VI-NEXT: v_mov_b32_e32 v6, 2 +; VI-NEXT: v_mov_b32_e32 v7, 2 +; VI-NEXT: v_mov_b32_e32 v8, 2 +; VI-NEXT: v_mov_b32_e32 v9, 3 +; VI-NEXT: v_mov_b32_e32 v10, 3 +; VI-NEXT: v_mov_b32_e32 v11, 3 +; VI-NEXT: v_mov_b32_e32 v12, 4 +; VI-NEXT: v_mov_b32_e32 v13, 4 +; VI-NEXT: v_mov_b32_e32 v14, 4 +; VI-NEXT: v_mov_b32_e32 v15, 5 +; VI-NEXT: v_mov_b32_e32 v16, 5 +; VI-NEXT: v_mov_b32_e32 v17, 5 +; VI-NEXT: v_mov_b32_e32 v18, 6 +; VI-NEXT: v_mov_b32_e32 v19, 6 +; VI-NEXT: v_mov_b32_e32 v20, 6 +; VI-NEXT: v_mov_b32_e32 v21, 7 +; VI-NEXT: v_mov_b32_e32 v22, 7 +; VI-NEXT: v_mov_b32_e32 v23, 7 +; VI-NEXT: v_mov_b32_e32 v24, 8 +; VI-NEXT: v_mov_b32_e32 v25, 8 +; VI-NEXT: v_mov_b32_e32 v26, 8 +; VI-NEXT: v_mov_b32_e32 v27, 9 +; VI-NEXT: v_mov_b32_e32 v28, 9 +; VI-NEXT: v_mov_b32_e32 v29, 9 +; VI-NEXT: v_mov_b32_e32 v30, 10 +; VI-NEXT: v_writelane_b32 v41, s4, 0 +; VI-NEXT: v_writelane_b32 v40, s31, 1 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: v_readlane_b32 s31, v40, 1 +; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: v_readlane_b32 s4, v41, 0 +; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, 
s[0:3], s33 offset:4 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[6:7] +; VI-NEXT: s_addk_i32 s32, 0xfc00 +; VI-NEXT: s_mov_b32 s33, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; MESA-LABEL: stack_12xv3i32: +; MESA: ; %bb.0: ; %entry +; MESA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MESA-NEXT: s_mov_b32 s4, s33 +; MESA-NEXT: s_mov_b32 s33, s32 +; MESA-NEXT: s_or_saveexec_b64 s[8:9], -1 +; MESA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MESA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MESA-NEXT: s_mov_b64 exec, s[8:9] +; MESA-NEXT: s_addk_i32 s32, 0x400 +; MESA-NEXT: v_mov_b32_e32 v0, 11 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; MESA-NEXT: v_mov_b32_e32 v0, 12 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; MESA-NEXT: v_mov_b32_e32 v0, 13 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; MESA-NEXT: v_mov_b32_e32 v0, 14 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; MESA-NEXT: v_mov_b32_e32 v0, 15 +; MESA-NEXT: v_writelane_b32 v40, s30, 0 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; MESA-NEXT: v_mov_b32_e32 v0, 0 +; MESA-NEXT: v_mov_b32_e32 v1, 0 +; MESA-NEXT: v_mov_b32_e32 v2, 0 +; MESA-NEXT: v_mov_b32_e32 v3, 1 +; MESA-NEXT: v_mov_b32_e32 v4, 1 +; MESA-NEXT: v_mov_b32_e32 v5, 1 +; MESA-NEXT: v_mov_b32_e32 v6, 2 +; MESA-NEXT: v_mov_b32_e32 v7, 2 +; MESA-NEXT: v_mov_b32_e32 v8, 2 +; MESA-NEXT: v_mov_b32_e32 v9, 3 +; MESA-NEXT: v_mov_b32_e32 v10, 3 +; MESA-NEXT: v_mov_b32_e32 v11, 3 +; MESA-NEXT: v_mov_b32_e32 v12, 4 +; MESA-NEXT: v_mov_b32_e32 v13, 4 +; MESA-NEXT: v_mov_b32_e32 v14, 4 +; MESA-NEXT: v_mov_b32_e32 v15, 5 +; MESA-NEXT: v_mov_b32_e32 v16, 5 +; MESA-NEXT: v_mov_b32_e32 v17, 5 +; MESA-NEXT: v_mov_b32_e32 v18, 6 +; MESA-NEXT: v_mov_b32_e32 v19, 6 +; MESA-NEXT: v_mov_b32_e32 v20, 6 +; MESA-NEXT: v_mov_b32_e32 v21, 7 +; MESA-NEXT: v_mov_b32_e32 v22, 7 +; 
MESA-NEXT: v_mov_b32_e32 v23, 7 +; MESA-NEXT: v_mov_b32_e32 v24, 8 +; MESA-NEXT: v_mov_b32_e32 v25, 8 +; MESA-NEXT: v_mov_b32_e32 v26, 8 +; MESA-NEXT: v_mov_b32_e32 v27, 9 +; MESA-NEXT: v_mov_b32_e32 v28, 9 +; MESA-NEXT: v_mov_b32_e32 v29, 9 +; MESA-NEXT: v_mov_b32_e32 v30, 10 +; MESA-NEXT: v_writelane_b32 v41, s4, 0 +; MESA-NEXT: v_writelane_b32 v40, s31, 1 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: v_readlane_b32 s31, v40, 1 +; MESA-NEXT: v_readlane_b32 s30, v40, 0 +; MESA-NEXT: v_readlane_b32 s4, v41, 0 +; MESA-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MESA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MESA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MESA-NEXT: s_mov_b64 exec, s[6:7] +; MESA-NEXT: s_addk_i32 s32, 0xfc00 +; MESA-NEXT: s_mov_b32 s33, s4 +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: stack_12xv3i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: v_writelane_b32 v40, 
s30, 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 1 +; GFX9-NEXT: v_mov_b32_e32 v4, 1 +; GFX9-NEXT: v_mov_b32_e32 v5, 1 +; GFX9-NEXT: v_mov_b32_e32 v6, 2 +; GFX9-NEXT: v_mov_b32_e32 v7, 2 +; GFX9-NEXT: v_mov_b32_e32 v8, 2 +; GFX9-NEXT: v_mov_b32_e32 v9, 3 +; GFX9-NEXT: v_mov_b32_e32 v10, 3 +; GFX9-NEXT: v_mov_b32_e32 v11, 3 +; GFX9-NEXT: v_mov_b32_e32 v12, 4 +; GFX9-NEXT: v_mov_b32_e32 v13, 4 +; GFX9-NEXT: v_mov_b32_e32 v14, 4 +; GFX9-NEXT: v_mov_b32_e32 v15, 5 +; GFX9-NEXT: v_mov_b32_e32 v16, 5 +; GFX9-NEXT: v_mov_b32_e32 v17, 5 +; GFX9-NEXT: v_mov_b32_e32 v18, 6 +; GFX9-NEXT: v_mov_b32_e32 v19, 6 +; GFX9-NEXT: v_mov_b32_e32 v20, 6 +; GFX9-NEXT: v_mov_b32_e32 v21, 7 +; GFX9-NEXT: v_mov_b32_e32 v22, 7 +; GFX9-NEXT: v_mov_b32_e32 v23, 7 +; GFX9-NEXT: v_mov_b32_e32 v24, 8 +; GFX9-NEXT: v_mov_b32_e32 v25, 8 +; GFX9-NEXT: v_mov_b32_e32 v26, 8 +; GFX9-NEXT: v_mov_b32_e32 v27, 9 +; GFX9-NEXT: v_mov_b32_e32 v28, 9 +; GFX9-NEXT: v_mov_b32_e32 v29, 9 +; GFX9-NEXT: v_mov_b32_e32 v30, 10 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_12xv3i32: +; GFX11: ; 
%bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, 11 :: v_dual_mov_b32 v1, 12 +; GFX11-NEXT: v_dual_mov_b32 v2, 13 :: v_dual_mov_b32 v3, 14 +; GFX11-NEXT: v_mov_b32_e32 v4, 15 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: s_add_i32 s0, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: scratch_store_b32 off, v4, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: v_dual_mov_b32 v3, 1 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, 1 :: v_dual_mov_b32 v4, 1 +; GFX11-NEXT: v_dual_mov_b32 v7, 2 :: v_dual_mov_b32 v6, 2 +; GFX11-NEXT: v_dual_mov_b32 v9, 3 :: v_dual_mov_b32 v8, 2 +; GFX11-NEXT: v_dual_mov_b32 v11, 3 :: v_dual_mov_b32 v10, 3 +; GFX11-NEXT: v_dual_mov_b32 v13, 4 :: v_dual_mov_b32 v12, 4 +; GFX11-NEXT: v_dual_mov_b32 v15, 5 :: v_dual_mov_b32 v14, 4 +; GFX11-NEXT: v_dual_mov_b32 v17, 5 :: v_dual_mov_b32 v16, 5 +; GFX11-NEXT: v_dual_mov_b32 v19, 6 :: v_dual_mov_b32 v18, 6 +; GFX11-NEXT: v_dual_mov_b32 v21, 7 :: v_dual_mov_b32 v20, 6 +; GFX11-NEXT: v_dual_mov_b32 v23, 7 :: v_dual_mov_b32 v22, 7 +; GFX11-NEXT: v_dual_mov_b32 v25, 8 :: v_dual_mov_b32 v24, 8 +; GFX11-NEXT: v_dual_mov_b32 v27, 9 :: v_dual_mov_b32 v26, 8 +; GFX11-NEXT: v_dual_mov_b32 v29, 9 :: v_dual_mov_b32 v28, 9 +; GFX11-NEXT: v_mov_b32_e32 v30, 10 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3i32@rel32@hi+12 +; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; HSA-LABEL: stack_12xv3i32: +; HSA: ; %bb.0: ; %entry +; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HSA-NEXT: s_mov_b32 s4, s33 +; HSA-NEXT: s_mov_b32 s33, s32 +; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 +; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; HSA-NEXT: s_mov_b64 exec, s[8:9] +; HSA-NEXT: s_addk_i32 s32, 0x400 +; HSA-NEXT: v_mov_b32_e32 v0, 11 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; HSA-NEXT: v_mov_b32_e32 v0, 12 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: v_mov_b32_e32 v0, 13 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; HSA-NEXT: v_mov_b32_e32 v0, 14 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; HSA-NEXT: v_mov_b32_e32 v0, 15 +; HSA-NEXT: v_writelane_b32 v40, s30, 0 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; HSA-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NEXT: v_mov_b32_e32 v1, 0 +; HSA-NEXT: v_mov_b32_e32 v2, 0 +; HSA-NEXT: v_mov_b32_e32 v3, 1 +; HSA-NEXT: v_mov_b32_e32 v4, 1 +; HSA-NEXT: v_mov_b32_e32 v5, 1 +; HSA-NEXT: v_mov_b32_e32 v6, 2 +; HSA-NEXT: v_mov_b32_e32 v7, 2 +; HSA-NEXT: v_mov_b32_e32 v8, 2 +; HSA-NEXT: v_mov_b32_e32 v9, 3 +; HSA-NEXT: v_mov_b32_e32 v10, 3 +; HSA-NEXT: v_mov_b32_e32 v11, 3 +; HSA-NEXT: v_mov_b32_e32 v12, 4 +; HSA-NEXT: v_mov_b32_e32 v13, 4 +; HSA-NEXT: 
v_mov_b32_e32 v14, 4 +; HSA-NEXT: v_mov_b32_e32 v15, 5 +; HSA-NEXT: v_mov_b32_e32 v16, 5 +; HSA-NEXT: v_mov_b32_e32 v17, 5 +; HSA-NEXT: v_mov_b32_e32 v18, 6 +; HSA-NEXT: v_mov_b32_e32 v19, 6 +; HSA-NEXT: v_mov_b32_e32 v20, 6 +; HSA-NEXT: v_mov_b32_e32 v21, 7 +; HSA-NEXT: v_mov_b32_e32 v22, 7 +; HSA-NEXT: v_mov_b32_e32 v23, 7 +; HSA-NEXT: v_mov_b32_e32 v24, 8 +; HSA-NEXT: v_mov_b32_e32 v25, 8 +; HSA-NEXT: v_mov_b32_e32 v26, 8 +; HSA-NEXT: v_mov_b32_e32 v27, 9 +; HSA-NEXT: v_mov_b32_e32 v28, 9 +; HSA-NEXT: v_mov_b32_e32 v29, 9 +; HSA-NEXT: v_mov_b32_e32 v30, 10 +; HSA-NEXT: v_writelane_b32 v41, s4, 0 +; HSA-NEXT: v_writelane_b32 v40, s31, 1 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: v_readlane_b32 s31, v40, 1 +; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: v_readlane_b32 s4, v41, 0 +; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; HSA-NEXT: s_mov_b64 exec, s[6:7] +; HSA-NEXT: s_addk_i32 s32, 0xfc00 +; HSA-NEXT: s_mov_b32 s33, s4 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_setpc_b64 s[30:31] entry: call void @external_void_func_12xv3i32( <3 x i32>, @@ -804,19 +6170,345 @@ ret void } -; GCN-LABEL: {{^}}stack_12xv3f32: -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 -; GCN: buffer_store_dword [[REG11]], {{.*$}} -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; GCN: buffer_store_dword [[REG15]], {{.*}} 
offset:16 -; GCN: s_getpc define void @stack_12xv3f32() #0 { +; VI-LABEL: stack_12xv3f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s33 +; VI-NEXT: s_mov_b32 s33, s32 +; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[8:9] +; VI-NEXT: s_addk_i32 s32, 0x400 +; VI-NEXT: v_mov_b32_e32 v0, 0x41300000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v0, 0x41400000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v0, 0x41500000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; VI-NEXT: v_mov_b32_e32 v0, 0x41600000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; VI-NEXT: v_mov_b32_e32 v0, 0x41700000 +; VI-NEXT: v_writelane_b32 v40, s30, 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: v_mov_b32_e32 v3, 1.0 +; VI-NEXT: v_mov_b32_e32 v4, 1.0 +; VI-NEXT: v_mov_b32_e32 v5, 1.0 +; VI-NEXT: v_mov_b32_e32 v6, 2.0 +; VI-NEXT: v_mov_b32_e32 v7, 2.0 +; VI-NEXT: v_mov_b32_e32 v8, 2.0 +; VI-NEXT: v_mov_b32_e32 v9, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v10, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v11, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v12, 4.0 +; VI-NEXT: v_mov_b32_e32 v13, 4.0 +; VI-NEXT: v_mov_b32_e32 v14, 4.0 +; VI-NEXT: v_mov_b32_e32 v15, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v17, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v18, 0x40c00000 +; VI-NEXT: v_mov_b32_e32 v19, 0x40c00000 +; VI-NEXT: v_mov_b32_e32 v20, 0x40c00000 +; VI-NEXT: v_mov_b32_e32 v21, 0x40e00000 +; VI-NEXT: v_mov_b32_e32 v22, 0x40e00000 +; VI-NEXT: v_mov_b32_e32 v23, 0x40e00000 +; VI-NEXT: v_mov_b32_e32 v24, 0x41000000 +; 
VI-NEXT: v_mov_b32_e32 v25, 0x41000000 +; VI-NEXT: v_mov_b32_e32 v26, 0x41000000 +; VI-NEXT: v_mov_b32_e32 v27, 0x41100000 +; VI-NEXT: v_mov_b32_e32 v28, 0x41100000 +; VI-NEXT: v_mov_b32_e32 v29, 0x41100000 +; VI-NEXT: v_mov_b32_e32 v30, 0x41200000 +; VI-NEXT: v_writelane_b32 v41, s4, 0 +; VI-NEXT: v_writelane_b32 v40, s31, 1 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: v_readlane_b32 s31, v40, 1 +; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: v_readlane_b32 s4, v41, 0 +; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[6:7] +; VI-NEXT: s_addk_i32 s32, 0xfc00 +; VI-NEXT: s_mov_b32 s33, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; MESA-LABEL: stack_12xv3f32: +; MESA: ; %bb.0: ; %entry +; MESA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MESA-NEXT: s_mov_b32 s4, s33 +; MESA-NEXT: s_mov_b32 s33, s32 +; MESA-NEXT: s_or_saveexec_b64 s[8:9], -1 +; MESA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MESA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MESA-NEXT: s_mov_b64 exec, s[8:9] +; MESA-NEXT: s_addk_i32 s32, 0x400 +; MESA-NEXT: v_mov_b32_e32 v0, 0x41300000 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; MESA-NEXT: v_mov_b32_e32 v0, 0x41400000 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; MESA-NEXT: v_mov_b32_e32 v0, 0x41500000 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; MESA-NEXT: v_mov_b32_e32 v0, 0x41600000 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; MESA-NEXT: v_mov_b32_e32 v0, 0x41700000 +; MESA-NEXT: v_writelane_b32 v40, s30, 0 +; MESA-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:16 +; MESA-NEXT: v_mov_b32_e32 v0, 0 +; MESA-NEXT: v_mov_b32_e32 v1, 0 +; MESA-NEXT: v_mov_b32_e32 v2, 0 +; MESA-NEXT: v_mov_b32_e32 v3, 1.0 +; MESA-NEXT: v_mov_b32_e32 v4, 1.0 +; MESA-NEXT: v_mov_b32_e32 v5, 1.0 +; MESA-NEXT: v_mov_b32_e32 v6, 2.0 +; MESA-NEXT: v_mov_b32_e32 v7, 2.0 +; MESA-NEXT: v_mov_b32_e32 v8, 2.0 +; MESA-NEXT: v_mov_b32_e32 v9, 0x40400000 +; MESA-NEXT: v_mov_b32_e32 v10, 0x40400000 +; MESA-NEXT: v_mov_b32_e32 v11, 0x40400000 +; MESA-NEXT: v_mov_b32_e32 v12, 4.0 +; MESA-NEXT: v_mov_b32_e32 v13, 4.0 +; MESA-NEXT: v_mov_b32_e32 v14, 4.0 +; MESA-NEXT: v_mov_b32_e32 v15, 0x40a00000 +; MESA-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; MESA-NEXT: v_mov_b32_e32 v17, 0x40a00000 +; MESA-NEXT: v_mov_b32_e32 v18, 0x40c00000 +; MESA-NEXT: v_mov_b32_e32 v19, 0x40c00000 +; MESA-NEXT: v_mov_b32_e32 v20, 0x40c00000 +; MESA-NEXT: v_mov_b32_e32 v21, 0x40e00000 +; MESA-NEXT: v_mov_b32_e32 v22, 0x40e00000 +; MESA-NEXT: v_mov_b32_e32 v23, 0x40e00000 +; MESA-NEXT: v_mov_b32_e32 v24, 0x41000000 +; MESA-NEXT: v_mov_b32_e32 v25, 0x41000000 +; MESA-NEXT: v_mov_b32_e32 v26, 0x41000000 +; MESA-NEXT: v_mov_b32_e32 v27, 0x41100000 +; MESA-NEXT: v_mov_b32_e32 v28, 0x41100000 +; MESA-NEXT: v_mov_b32_e32 v29, 0x41100000 +; MESA-NEXT: v_mov_b32_e32 v30, 0x41200000 +; MESA-NEXT: v_writelane_b32 v41, s4, 0 +; MESA-NEXT: v_writelane_b32 v40, s31, 1 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: v_readlane_b32 s31, v40, 1 +; MESA-NEXT: v_readlane_b32 s30, v40, 0 +; MESA-NEXT: v_readlane_b32 s4, v41, 0 +; MESA-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MESA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MESA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MESA-NEXT: s_mov_b64 exec, s[6:7] +; MESA-NEXT: 
s_addk_i32 s32, 0xfc00 +; MESA-NEXT: s_mov_b32 s33, s4 +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: stack_12xv3f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41300000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41400000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v4, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v6, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v7, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v8, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v11, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v12, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v13, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v14, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v17, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v18, 0x40c00000 +; GFX9-NEXT: v_mov_b32_e32 v19, 0x40c00000 +; GFX9-NEXT: v_mov_b32_e32 v20, 0x40c00000 +; GFX9-NEXT: v_mov_b32_e32 
v21, 0x40e00000 +; GFX9-NEXT: v_mov_b32_e32 v22, 0x40e00000 +; GFX9-NEXT: v_mov_b32_e32 v23, 0x40e00000 +; GFX9-NEXT: v_mov_b32_e32 v24, 0x41000000 +; GFX9-NEXT: v_mov_b32_e32 v25, 0x41000000 +; GFX9-NEXT: v_mov_b32_e32 v26, 0x41000000 +; GFX9-NEXT: v_mov_b32_e32 v27, 0x41100000 +; GFX9-NEXT: v_mov_b32_e32 v28, 0x41100000 +; GFX9-NEXT: v_mov_b32_e32 v29, 0x41100000 +; GFX9-NEXT: v_mov_b32_e32 v30, 0x41200000 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_12xv3f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x41300000 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x41400000 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x41500000 +; GFX11-NEXT: v_mov_b32_e32 v3, 0x41600000 +; GFX11-NEXT: v_dual_mov_b32 v4, 0x41700000 :: v_dual_mov_b32 v5, 1.0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: 
s_add_i32 s0, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: scratch_store_b32 off, v4, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v6, 2.0 :: v_dual_mov_b32 v9, 0x40400000 +; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v11, 0x40400000 +; GFX11-NEXT: v_dual_mov_b32 v10, 0x40400000 :: v_dual_mov_b32 v13, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v12, 4.0 :: v_dual_mov_b32 v15, 0x40a00000 +; GFX11-NEXT: v_dual_mov_b32 v14, 4.0 :: v_dual_mov_b32 v17, 0x40a00000 +; GFX11-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; GFX11-NEXT: v_dual_mov_b32 v18, 0x40c00000 :: v_dual_mov_b32 v19, 0x40c00000 +; GFX11-NEXT: v_mov_b32_e32 v20, 0x40c00000 +; GFX11-NEXT: v_dual_mov_b32 v21, 0x40e00000 :: v_dual_mov_b32 v22, 0x40e00000 +; GFX11-NEXT: v_mov_b32_e32 v23, 0x40e00000 +; GFX11-NEXT: v_dual_mov_b32 v24, 0x41000000 :: v_dual_mov_b32 v25, 0x41000000 +; GFX11-NEXT: v_mov_b32_e32 v26, 0x41000000 +; GFX11-NEXT: v_dual_mov_b32 v27, 0x41100000 :: v_dual_mov_b32 v28, 0x41100000 +; GFX11-NEXT: v_mov_b32_e32 v29, 0x41100000 +; GFX11-NEXT: v_mov_b32_e32 v30, 0x41200000 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3f32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; 
GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; HSA-LABEL: stack_12xv3f32: +; HSA: ; %bb.0: ; %entry +; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HSA-NEXT: s_mov_b32 s4, s33 +; HSA-NEXT: s_mov_b32 s33, s32 +; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 +; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; HSA-NEXT: s_mov_b64 exec, s[8:9] +; HSA-NEXT: s_addk_i32 s32, 0x400 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41300000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41400000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41500000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41600000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41700000 +; HSA-NEXT: v_writelane_b32 v40, s30, 0 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; HSA-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NEXT: v_mov_b32_e32 v1, 0 +; HSA-NEXT: v_mov_b32_e32 v2, 0 +; HSA-NEXT: v_mov_b32_e32 v3, 1.0 +; HSA-NEXT: v_mov_b32_e32 v4, 1.0 +; HSA-NEXT: v_mov_b32_e32 v5, 1.0 +; HSA-NEXT: v_mov_b32_e32 v6, 2.0 +; HSA-NEXT: v_mov_b32_e32 v7, 2.0 +; HSA-NEXT: v_mov_b32_e32 v8, 2.0 +; HSA-NEXT: v_mov_b32_e32 v9, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v10, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v11, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v12, 4.0 +; HSA-NEXT: v_mov_b32_e32 v13, 4.0 +; HSA-NEXT: v_mov_b32_e32 v14, 4.0 +; HSA-NEXT: v_mov_b32_e32 v15, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v16, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v17, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v18, 0x40c00000 +; HSA-NEXT: v_mov_b32_e32 v19, 0x40c00000 +; HSA-NEXT: v_mov_b32_e32 v20, 0x40c00000 +; HSA-NEXT: v_mov_b32_e32 v21, 0x40e00000 +; HSA-NEXT: v_mov_b32_e32 v22, 0x40e00000 +; 
HSA-NEXT: v_mov_b32_e32 v23, 0x40e00000 +; HSA-NEXT: v_mov_b32_e32 v24, 0x41000000 +; HSA-NEXT: v_mov_b32_e32 v25, 0x41000000 +; HSA-NEXT: v_mov_b32_e32 v26, 0x41000000 +; HSA-NEXT: v_mov_b32_e32 v27, 0x41100000 +; HSA-NEXT: v_mov_b32_e32 v28, 0x41100000 +; HSA-NEXT: v_mov_b32_e32 v29, 0x41100000 +; HSA-NEXT: v_mov_b32_e32 v30, 0x41200000 +; HSA-NEXT: v_writelane_b32 v41, s4, 0 +; HSA-NEXT: v_writelane_b32 v40, s31, 1 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: v_readlane_b32 s31, v40, 1 +; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: v_readlane_b32 s4, v41, 0 +; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; HSA-NEXT: s_mov_b64 exec, s[6:7] +; HSA-NEXT: s_addk_i32 s32, 0xfc00 +; HSA-NEXT: s_mov_b32 s33, s4 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_setpc_b64 s[30:31] entry: call void @external_void_func_12xv3f32( <3 x float>, @@ -834,27 +6526,378 @@ ret void } -; GCN-LABEL: {{^}}stack_8xv5i32: -; GCN: v_mov_b32_e32 [[REG7:v[0-9]+]], 7 -; GCN: buffer_store_dword [[REG7]], {{.*$}} -; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4 -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: 
buffer_store_dword [[REG14]], {{.*}} offset:28 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32 -; GCN: s_getpc define void @stack_8xv5i32() #0 { +; VI-LABEL: stack_8xv5i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s33 +; VI-NEXT: s_mov_b32 s33, s32 +; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[8:9] +; VI-NEXT: s_addk_i32 s32, 0x400 +; VI-NEXT: v_mov_b32_e32 v0, 7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v0, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v0, 9 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; VI-NEXT: v_mov_b32_e32 v0, 10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; VI-NEXT: v_mov_b32_e32 v0, 11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; VI-NEXT: v_mov_b32_e32 v0, 12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; VI-NEXT: v_mov_b32_e32 v0, 13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; VI-NEXT: v_mov_b32_e32 v0, 14 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; VI-NEXT: v_mov_b32_e32 v0, 15 +; VI-NEXT: v_writelane_b32 v40, s30, 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v4, 0 +; VI-NEXT: v_mov_b32_e32 v5, 1 +; VI-NEXT: v_mov_b32_e32 v6, 1 +; VI-NEXT: v_mov_b32_e32 v7, 1 +; VI-NEXT: v_mov_b32_e32 v8, 1 +; VI-NEXT: v_mov_b32_e32 v9, 1 +; VI-NEXT: v_mov_b32_e32 v10, 2 +; VI-NEXT: v_mov_b32_e32 v11, 2 +; VI-NEXT: v_mov_b32_e32 v12, 2 +; VI-NEXT: v_mov_b32_e32 v13, 2 +; VI-NEXT: v_mov_b32_e32 v14, 2 +; 
VI-NEXT: v_mov_b32_e32 v15, 3 +; VI-NEXT: v_mov_b32_e32 v16, 3 +; VI-NEXT: v_mov_b32_e32 v17, 3 +; VI-NEXT: v_mov_b32_e32 v18, 3 +; VI-NEXT: v_mov_b32_e32 v19, 3 +; VI-NEXT: v_mov_b32_e32 v20, 4 +; VI-NEXT: v_mov_b32_e32 v21, 4 +; VI-NEXT: v_mov_b32_e32 v22, 4 +; VI-NEXT: v_mov_b32_e32 v23, 4 +; VI-NEXT: v_mov_b32_e32 v24, 4 +; VI-NEXT: v_mov_b32_e32 v25, 5 +; VI-NEXT: v_mov_b32_e32 v26, 5 +; VI-NEXT: v_mov_b32_e32 v27, 5 +; VI-NEXT: v_mov_b32_e32 v28, 5 +; VI-NEXT: v_mov_b32_e32 v29, 5 +; VI-NEXT: v_mov_b32_e32 v30, 6 +; VI-NEXT: v_writelane_b32 v41, s4, 0 +; VI-NEXT: v_writelane_b32 v40, s31, 1 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: v_readlane_b32 s31, v40, 1 +; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: v_readlane_b32 s4, v41, 0 +; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[6:7] +; VI-NEXT: s_addk_i32 s32, 0xfc00 +; VI-NEXT: s_mov_b32 s33, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; MESA-LABEL: stack_8xv5i32: +; MESA: ; %bb.0: ; %entry +; MESA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MESA-NEXT: s_mov_b32 s4, s33 +; MESA-NEXT: s_mov_b32 s33, s32 +; MESA-NEXT: s_or_saveexec_b64 s[8:9], -1 +; MESA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MESA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MESA-NEXT: s_mov_b64 exec, s[8:9] +; MESA-NEXT: s_addk_i32 s32, 0x400 +; MESA-NEXT: v_mov_b32_e32 v0, 7 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; MESA-NEXT: v_mov_b32_e32 v0, 8 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; MESA-NEXT: v_mov_b32_e32 v0, 9 +; MESA-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:8 +; MESA-NEXT: v_mov_b32_e32 v0, 10 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; MESA-NEXT: v_mov_b32_e32 v0, 11 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; MESA-NEXT: v_mov_b32_e32 v0, 12 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; MESA-NEXT: v_mov_b32_e32 v0, 13 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; MESA-NEXT: v_mov_b32_e32 v0, 14 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; MESA-NEXT: v_mov_b32_e32 v0, 15 +; MESA-NEXT: v_writelane_b32 v40, s30, 0 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; MESA-NEXT: v_mov_b32_e32 v0, 0 +; MESA-NEXT: v_mov_b32_e32 v1, 0 +; MESA-NEXT: v_mov_b32_e32 v2, 0 +; MESA-NEXT: v_mov_b32_e32 v3, 0 +; MESA-NEXT: v_mov_b32_e32 v4, 0 +; MESA-NEXT: v_mov_b32_e32 v5, 1 +; MESA-NEXT: v_mov_b32_e32 v6, 1 +; MESA-NEXT: v_mov_b32_e32 v7, 1 +; MESA-NEXT: v_mov_b32_e32 v8, 1 +; MESA-NEXT: v_mov_b32_e32 v9, 1 +; MESA-NEXT: v_mov_b32_e32 v10, 2 +; MESA-NEXT: v_mov_b32_e32 v11, 2 +; MESA-NEXT: v_mov_b32_e32 v12, 2 +; MESA-NEXT: v_mov_b32_e32 v13, 2 +; MESA-NEXT: v_mov_b32_e32 v14, 2 +; MESA-NEXT: v_mov_b32_e32 v15, 3 +; MESA-NEXT: v_mov_b32_e32 v16, 3 +; MESA-NEXT: v_mov_b32_e32 v17, 3 +; MESA-NEXT: v_mov_b32_e32 v18, 3 +; MESA-NEXT: v_mov_b32_e32 v19, 3 +; MESA-NEXT: v_mov_b32_e32 v20, 4 +; MESA-NEXT: v_mov_b32_e32 v21, 4 +; MESA-NEXT: v_mov_b32_e32 v22, 4 +; MESA-NEXT: v_mov_b32_e32 v23, 4 +; MESA-NEXT: v_mov_b32_e32 v24, 4 +; MESA-NEXT: v_mov_b32_e32 v25, 5 +; MESA-NEXT: v_mov_b32_e32 v26, 5 +; MESA-NEXT: v_mov_b32_e32 v27, 5 +; MESA-NEXT: v_mov_b32_e32 v28, 5 +; MESA-NEXT: v_mov_b32_e32 v29, 5 +; MESA-NEXT: v_mov_b32_e32 v30, 6 +; MESA-NEXT: v_writelane_b32 v41, s4, 0 +; MESA-NEXT: v_writelane_b32 v40, s31, 1 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, 
external_void_func_8xv5i32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: v_readlane_b32 s31, v40, 1 +; MESA-NEXT: v_readlane_b32 s30, v40, 0 +; MESA-NEXT: v_readlane_b32 s4, v41, 0 +; MESA-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MESA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MESA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MESA-NEXT: s_mov_b64 exec, s[6:7] +; MESA-NEXT: s_addk_i32 s32, 0xfc00 +; MESA-NEXT: s_mov_b32 s33, s4 +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: stack_8xv5i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 7 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, 11 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, 12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX9-NEXT: v_mov_b32_e32 v0, 14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 1 +; GFX9-NEXT: v_mov_b32_e32 v6, 1 +; GFX9-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-NEXT: v_mov_b32_e32 v8, 1 +; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-NEXT: v_mov_b32_e32 v10, 2 +; GFX9-NEXT: v_mov_b32_e32 v11, 2 +; GFX9-NEXT: v_mov_b32_e32 v12, 2 +; GFX9-NEXT: v_mov_b32_e32 v13, 2 +; GFX9-NEXT: v_mov_b32_e32 v14, 2 +; GFX9-NEXT: v_mov_b32_e32 v15, 3 +; GFX9-NEXT: v_mov_b32_e32 v16, 3 +; GFX9-NEXT: v_mov_b32_e32 v17, 3 +; GFX9-NEXT: v_mov_b32_e32 v18, 3 +; GFX9-NEXT: v_mov_b32_e32 v19, 3 +; GFX9-NEXT: v_mov_b32_e32 v20, 4 +; GFX9-NEXT: v_mov_b32_e32 v21, 4 +; GFX9-NEXT: v_mov_b32_e32 v22, 4 +; GFX9-NEXT: v_mov_b32_e32 v23, 4 +; GFX9-NEXT: v_mov_b32_e32 v24, 4 +; GFX9-NEXT: v_mov_b32_e32 v25, 5 +; GFX9-NEXT: v_mov_b32_e32 v26, 5 +; GFX9-NEXT: v_mov_b32_e32 v27, 5 +; GFX9-NEXT: v_mov_b32_e32 v28, 5 +; GFX9-NEXT: v_mov_b32_e32 v29, 5 +; GFX9-NEXT: v_mov_b32_e32 v30, 6 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_8xv5i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 
+; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, 8 +; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_mov_b32 v3, 10 +; GFX11-NEXT: v_dual_mov_b32 v8, 15 :: v_dual_mov_b32 v5, 12 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_dual_mov_b32 v4, 11 :: v_dual_mov_b32 v7, 14 +; GFX11-NEXT: v_mov_b32_e32 v6, 13 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: s_add_i32 s0, s32, 32 +; GFX11-NEXT: s_add_i32 s1, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: scratch_store_b32 off, v8, s0 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], s1 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, 1 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 1 +; GFX11-NEXT: v_dual_mov_b32 v6, 1 :: v_dual_mov_b32 v9, 1 +; GFX11-NEXT: v_dual_mov_b32 v8, 1 :: v_dual_mov_b32 v11, 2 +; GFX11-NEXT: v_dual_mov_b32 v10, 2 :: v_dual_mov_b32 v13, 2 +; GFX11-NEXT: v_dual_mov_b32 v12, 2 :: v_dual_mov_b32 v15, 3 +; GFX11-NEXT: v_dual_mov_b32 v14, 2 :: v_dual_mov_b32 v17, 3 +; GFX11-NEXT: v_dual_mov_b32 v16, 3 :: v_dual_mov_b32 v19, 3 +; GFX11-NEXT: v_dual_mov_b32 v18, 3 :: v_dual_mov_b32 v21, 4 +; GFX11-NEXT: v_dual_mov_b32 v20, 4 :: v_dual_mov_b32 v23, 4 +; GFX11-NEXT: v_dual_mov_b32 v22, 4 :: v_dual_mov_b32 v25, 5 +; GFX11-NEXT: v_dual_mov_b32 v24, 4 :: v_dual_mov_b32 v27, 5 +; GFX11-NEXT: v_dual_mov_b32 v26, 5 :: v_dual_mov_b32 v29, 5 +; GFX11-NEXT: v_mov_b32_e32 v28, 5 +; GFX11-NEXT: v_mov_b32_e32 v30, 6 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, 
external_void_func_8xv5i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; HSA-LABEL: stack_8xv5i32: +; HSA: ; %bb.0: ; %entry +; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HSA-NEXT: s_mov_b32 s4, s33 +; HSA-NEXT: s_mov_b32 s33, s32 +; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 +; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; HSA-NEXT: s_mov_b64 exec, s[8:9] +; HSA-NEXT: s_addk_i32 s32, 0x400 +; HSA-NEXT: v_mov_b32_e32 v0, 7 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; HSA-NEXT: v_mov_b32_e32 v0, 8 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: v_mov_b32_e32 v0, 9 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; HSA-NEXT: v_mov_b32_e32 v0, 10 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; HSA-NEXT: v_mov_b32_e32 v0, 11 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; HSA-NEXT: v_mov_b32_e32 v0, 12 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; HSA-NEXT: v_mov_b32_e32 v0, 13 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; HSA-NEXT: v_mov_b32_e32 v0, 14 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; HSA-NEXT: v_mov_b32_e32 v0, 15 +; HSA-NEXT: v_writelane_b32 v40, s30, 0 +; HSA-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:32 +; HSA-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NEXT: v_mov_b32_e32 v1, 0 +; HSA-NEXT: v_mov_b32_e32 v2, 0 +; HSA-NEXT: v_mov_b32_e32 v3, 0 +; HSA-NEXT: v_mov_b32_e32 v4, 0 +; HSA-NEXT: v_mov_b32_e32 v5, 1 +; HSA-NEXT: v_mov_b32_e32 v6, 1 +; HSA-NEXT: v_mov_b32_e32 v7, 1 +; HSA-NEXT: v_mov_b32_e32 v8, 1 +; HSA-NEXT: v_mov_b32_e32 v9, 1 +; HSA-NEXT: v_mov_b32_e32 v10, 2 +; HSA-NEXT: v_mov_b32_e32 v11, 2 +; HSA-NEXT: v_mov_b32_e32 v12, 2 +; HSA-NEXT: v_mov_b32_e32 v13, 2 +; HSA-NEXT: v_mov_b32_e32 v14, 2 +; HSA-NEXT: v_mov_b32_e32 v15, 3 +; HSA-NEXT: v_mov_b32_e32 v16, 3 +; HSA-NEXT: v_mov_b32_e32 v17, 3 +; HSA-NEXT: v_mov_b32_e32 v18, 3 +; HSA-NEXT: v_mov_b32_e32 v19, 3 +; HSA-NEXT: v_mov_b32_e32 v20, 4 +; HSA-NEXT: v_mov_b32_e32 v21, 4 +; HSA-NEXT: v_mov_b32_e32 v22, 4 +; HSA-NEXT: v_mov_b32_e32 v23, 4 +; HSA-NEXT: v_mov_b32_e32 v24, 4 +; HSA-NEXT: v_mov_b32_e32 v25, 5 +; HSA-NEXT: v_mov_b32_e32 v26, 5 +; HSA-NEXT: v_mov_b32_e32 v27, 5 +; HSA-NEXT: v_mov_b32_e32 v28, 5 +; HSA-NEXT: v_mov_b32_e32 v29, 5 +; HSA-NEXT: v_mov_b32_e32 v30, 6 +; HSA-NEXT: v_writelane_b32 v41, s4, 0 +; HSA-NEXT: v_writelane_b32 v40, s31, 1 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: v_readlane_b32 s31, v40, 1 +; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: v_readlane_b32 s4, v41, 0 +; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; HSA-NEXT: s_mov_b64 exec, s[6:7] +; HSA-NEXT: s_addk_i32 s32, 0xfc00 +; HSA-NEXT: s_mov_b32 s33, s4 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_setpc_b64 s[30:31] entry: call void @external_void_func_8xv5i32( <5 x i32>, @@ -868,27 +6911,381 @@ ret void } -; GCN-LABEL: {{^}}stack_8xv5f32: -; GCN: 
v_mov_b32_e32 [[REG7:v[0-9]+]], 0x40e00000 -; GCN: buffer_store_dword [[REG7]], {{.*$}} -; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 -; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4 -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32 -; GCN: s_getpc define void @stack_8xv5f32() #0 { +; VI-LABEL: stack_8xv5f32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s33 +; VI-NEXT: s_mov_b32 s33, s32 +; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[8:9] +; VI-NEXT: s_addk_i32 s32, 0x400 +; VI-NEXT: v_mov_b32_e32 v0, 0x40e00000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v0, 0x41000000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v0, 0x41100000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; VI-NEXT: v_mov_b32_e32 v0, 0x41200000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; VI-NEXT: v_mov_b32_e32 v0, 0x41300000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; VI-NEXT: v_mov_b32_e32 v0, 0x41400000 +; VI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:20 +; VI-NEXT: v_mov_b32_e32 v0, 0x41500000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; VI-NEXT: v_mov_b32_e32 v0, 0x41600000 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; VI-NEXT: v_mov_b32_e32 v0, 0x41700000 +; VI-NEXT: v_writelane_b32 v40, s30, 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v4, 0 +; VI-NEXT: v_mov_b32_e32 v5, 1.0 +; VI-NEXT: v_mov_b32_e32 v6, 1.0 +; VI-NEXT: v_mov_b32_e32 v7, 1.0 +; VI-NEXT: v_mov_b32_e32 v8, 1.0 +; VI-NEXT: v_mov_b32_e32 v9, 1.0 +; VI-NEXT: v_mov_b32_e32 v10, 2.0 +; VI-NEXT: v_mov_b32_e32 v11, 2.0 +; VI-NEXT: v_mov_b32_e32 v12, 2.0 +; VI-NEXT: v_mov_b32_e32 v13, 2.0 +; VI-NEXT: v_mov_b32_e32 v14, 2.0 +; VI-NEXT: v_mov_b32_e32 v15, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v16, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v17, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v18, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v19, 0x40400000 +; VI-NEXT: v_mov_b32_e32 v20, 4.0 +; VI-NEXT: v_mov_b32_e32 v21, 4.0 +; VI-NEXT: v_mov_b32_e32 v22, 4.0 +; VI-NEXT: v_mov_b32_e32 v23, 4.0 +; VI-NEXT: v_mov_b32_e32 v24, 4.0 +; VI-NEXT: v_mov_b32_e32 v25, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v26, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v27, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v28, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v29, 0x40a00000 +; VI-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; VI-NEXT: v_writelane_b32 v41, s4, 0 +; VI-NEXT: v_writelane_b32 v40, s31, 1 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: v_readlane_b32 s31, v40, 1 +; VI-NEXT: v_readlane_b32 s30, v40, 0 +; VI-NEXT: v_readlane_b32 s4, v41, 0 +; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 +; VI-NEXT: buffer_load_dword 
v40, off, s[0:3], s33 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 exec, s[6:7] +; VI-NEXT: s_addk_i32 s32, 0xfc00 +; VI-NEXT: s_mov_b32 s33, s4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; MESA-LABEL: stack_8xv5f32: +; MESA: ; %bb.0: ; %entry +; MESA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MESA-NEXT: s_mov_b32 s4, s33 +; MESA-NEXT: s_mov_b32 s33, s32 +; MESA-NEXT: s_or_saveexec_b64 s[8:9], -1 +; MESA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MESA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MESA-NEXT: s_mov_b64 exec, s[8:9] +; MESA-NEXT: s_addk_i32 s32, 0x400 +; MESA-NEXT: v_mov_b32_e32 v0, 0x40e00000 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; MESA-NEXT: v_mov_b32_e32 v0, 0x41000000 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; MESA-NEXT: v_mov_b32_e32 v0, 0x41100000 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; MESA-NEXT: v_mov_b32_e32 v0, 0x41200000 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; MESA-NEXT: v_mov_b32_e32 v0, 0x41300000 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; MESA-NEXT: v_mov_b32_e32 v0, 0x41400000 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; MESA-NEXT: v_mov_b32_e32 v0, 0x41500000 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; MESA-NEXT: v_mov_b32_e32 v0, 0x41600000 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; MESA-NEXT: v_mov_b32_e32 v0, 0x41700000 +; MESA-NEXT: v_writelane_b32 v40, s30, 0 +; MESA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; MESA-NEXT: v_mov_b32_e32 v0, 0 +; MESA-NEXT: v_mov_b32_e32 v1, 0 +; MESA-NEXT: v_mov_b32_e32 v2, 0 +; MESA-NEXT: v_mov_b32_e32 v3, 0 +; MESA-NEXT: v_mov_b32_e32 v4, 0 +; MESA-NEXT: v_mov_b32_e32 v5, 1.0 +; MESA-NEXT: v_mov_b32_e32 v6, 1.0 +; MESA-NEXT: 
v_mov_b32_e32 v7, 1.0 +; MESA-NEXT: v_mov_b32_e32 v8, 1.0 +; MESA-NEXT: v_mov_b32_e32 v9, 1.0 +; MESA-NEXT: v_mov_b32_e32 v10, 2.0 +; MESA-NEXT: v_mov_b32_e32 v11, 2.0 +; MESA-NEXT: v_mov_b32_e32 v12, 2.0 +; MESA-NEXT: v_mov_b32_e32 v13, 2.0 +; MESA-NEXT: v_mov_b32_e32 v14, 2.0 +; MESA-NEXT: v_mov_b32_e32 v15, 0x40400000 +; MESA-NEXT: v_mov_b32_e32 v16, 0x40400000 +; MESA-NEXT: v_mov_b32_e32 v17, 0x40400000 +; MESA-NEXT: v_mov_b32_e32 v18, 0x40400000 +; MESA-NEXT: v_mov_b32_e32 v19, 0x40400000 +; MESA-NEXT: v_mov_b32_e32 v20, 4.0 +; MESA-NEXT: v_mov_b32_e32 v21, 4.0 +; MESA-NEXT: v_mov_b32_e32 v22, 4.0 +; MESA-NEXT: v_mov_b32_e32 v23, 4.0 +; MESA-NEXT: v_mov_b32_e32 v24, 4.0 +; MESA-NEXT: v_mov_b32_e32 v25, 0x40a00000 +; MESA-NEXT: v_mov_b32_e32 v26, 0x40a00000 +; MESA-NEXT: v_mov_b32_e32 v27, 0x40a00000 +; MESA-NEXT: v_mov_b32_e32 v28, 0x40a00000 +; MESA-NEXT: v_mov_b32_e32 v29, 0x40a00000 +; MESA-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; MESA-NEXT: v_writelane_b32 v41, s4, 0 +; MESA-NEXT: v_writelane_b32 v40, s31, 1 +; MESA-NEXT: s_getpc_b64 s[4:5] +; MESA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 +; MESA-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 +; MESA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MESA-NEXT: v_readlane_b32 s31, v40, 1 +; MESA-NEXT: v_readlane_b32 s30, v40, 0 +; MESA-NEXT: v_readlane_b32 s4, v41, 0 +; MESA-NEXT: s_or_saveexec_b64 s[6:7], -1 +; MESA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MESA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MESA-NEXT: s_mov_b64 exec, s[6:7] +; MESA-NEXT: s_addk_i32 s32, 0xfc00 +; MESA-NEXT: s_mov_b32 s33, s4 +; MESA-NEXT: s_waitcnt vmcnt(0) +; MESA-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: stack_8xv5f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 +; GFX9-NEXT: 
buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[8:9] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x40e00000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41000000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41100000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41200000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41300000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41400000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v6, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v7, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v8, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v9, 1.0 +; GFX9-NEXT: v_mov_b32_e32 v10, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v11, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v12, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v13, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v14, 2.0 +; GFX9-NEXT: v_mov_b32_e32 v15, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v16, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v17, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 v19, 0x40400000 +; GFX9-NEXT: v_mov_b32_e32 
v20, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v21, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v22, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v23, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v24, 4.0 +; GFX9-NEXT: v_mov_b32_e32 v25, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v26, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v27, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v28, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000 +; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[6:7] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: stack_8xv5f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x40e00000 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x41000000 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x41100000 +; GFX11-NEXT: v_mov_b32_e32 v3, 0x41200000 +; GFX11-NEXT: v_mov_b32_e32 v8, 0x41700000 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_mov_b32_e32 v4, 0x41300000 +; GFX11-NEXT: v_mov_b32_e32 v5, 
0x41400000 +; GFX11-NEXT: v_dual_mov_b32 v6, 0x41500000 :: v_dual_mov_b32 v9, 1.0 +; GFX11-NEXT: v_mov_b32_e32 v7, 0x41600000 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: s_add_i32 s0, s32, 32 +; GFX11-NEXT: s_add_i32 s1, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: scratch_store_b32 off, v8, s0 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], s1 +; GFX11-NEXT: v_mov_b32_e32 v6, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_mov_b32 v8, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v11, 2.0 :: v_dual_mov_b32 v10, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v13, 2.0 :: v_dual_mov_b32 v12, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v15, 0x40400000 :: v_dual_mov_b32 v14, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v17, 0x40400000 :: v_dual_mov_b32 v16, 0x40400000 +; GFX11-NEXT: v_dual_mov_b32 v19, 0x40400000 :: v_dual_mov_b32 v18, 0x40400000 +; GFX11-NEXT: v_dual_mov_b32 v21, 4.0 :: v_dual_mov_b32 v20, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v23, 4.0 :: v_dual_mov_b32 v22, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v25, 0x40a00000 :: v_dual_mov_b32 v24, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v27, 0x40a00000 :: v_dual_mov_b32 v26, 0x40a00000 +; GFX11-NEXT: v_dual_mov_b32 v29, 0x40a00000 :: v_dual_mov_b32 v28, 0x40a00000 +; GFX11-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5f32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 +; GFX11-NEXT: s_or_saveexec_b32 s1, -1 +; GFX11-NEXT: s_clause 0x1 +; 
GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; HSA-LABEL: stack_8xv5f32: +; HSA: ; %bb.0: ; %entry +; HSA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HSA-NEXT: s_mov_b32 s4, s33 +; HSA-NEXT: s_mov_b32 s33, s32 +; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 +; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; HSA-NEXT: s_mov_b64 exec, s[8:9] +; HSA-NEXT: s_addk_i32 s32, 0x400 +; HSA-NEXT: v_mov_b32_e32 v0, 0x40e00000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41000000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41100000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41200000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41300000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41400000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41500000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41600000 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 +; HSA-NEXT: v_mov_b32_e32 v0, 0x41700000 +; HSA-NEXT: v_writelane_b32 v40, s30, 0 +; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; HSA-NEXT: v_mov_b32_e32 v0, 0 +; HSA-NEXT: v_mov_b32_e32 v1, 0 +; HSA-NEXT: v_mov_b32_e32 v2, 0 +; HSA-NEXT: v_mov_b32_e32 v3, 0 +; HSA-NEXT: v_mov_b32_e32 v4, 0 +; HSA-NEXT: v_mov_b32_e32 v5, 1.0 +; HSA-NEXT: v_mov_b32_e32 v6, 1.0 +; HSA-NEXT: v_mov_b32_e32 v7, 1.0 +; HSA-NEXT: v_mov_b32_e32 v8, 
1.0 +; HSA-NEXT: v_mov_b32_e32 v9, 1.0 +; HSA-NEXT: v_mov_b32_e32 v10, 2.0 +; HSA-NEXT: v_mov_b32_e32 v11, 2.0 +; HSA-NEXT: v_mov_b32_e32 v12, 2.0 +; HSA-NEXT: v_mov_b32_e32 v13, 2.0 +; HSA-NEXT: v_mov_b32_e32 v14, 2.0 +; HSA-NEXT: v_mov_b32_e32 v15, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v16, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v17, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v18, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v19, 0x40400000 +; HSA-NEXT: v_mov_b32_e32 v20, 4.0 +; HSA-NEXT: v_mov_b32_e32 v21, 4.0 +; HSA-NEXT: v_mov_b32_e32 v22, 4.0 +; HSA-NEXT: v_mov_b32_e32 v23, 4.0 +; HSA-NEXT: v_mov_b32_e32 v24, 4.0 +; HSA-NEXT: v_mov_b32_e32 v25, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v26, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v27, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v28, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v29, 0x40a00000 +; HSA-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; HSA-NEXT: v_writelane_b32 v41, s4, 0 +; HSA-NEXT: v_writelane_b32 v40, s31, 1 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 +; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] +; HSA-NEXT: v_readlane_b32 s31, v40, 1 +; HSA-NEXT: v_readlane_b32 s30, v40, 0 +; HSA-NEXT: v_readlane_b32 s4, v41, 0 +; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 +; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; HSA-NEXT: s_mov_b64 exec, s[6:7] +; HSA-NEXT: s_addk_i32 s32, 0xfc00 +; HSA-NEXT: s_mov_b32 s33, s4 +; HSA-NEXT: s_waitcnt vmcnt(0) +; HSA-NEXT: s_setpc_b64 s[30:31] entry: call void @external_void_func_8xv5f32( <5 x float>, @@ -912,9 +7309,10 @@ <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>) #0 declare hidden void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>, <5 x float>) 
#0 + attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind noinline } - - - +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CI: {{.*}} +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/call-return-types.ll b/llvm/test/CodeGen/AMDGPU/call-return-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-return-types.ll @@ -1,6 +1,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX89 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s declare void @external_void_func_void() #0 @@ -178,6 +179,7 @@ ; GCN: s_swappc ; GFX7-DAG: flat_store_dwordx3 {{.*}}, v[0:2] ; GFX89-DAG: buffer_store_dwordx3 v[0:2] +; GFX11-DAG: buffer_store_b96 v[0:2] define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 { %val = call <3 x i32> @external_v3i32_func_void() store volatile <3 x i32> %val, ptr addrspace(1) undef, align 8 @@ -197,6 +199,8 @@ ; GFX7-DAG: flat_store_dword {{.*}}, v4 ; GFX89-DAG: buffer_store_dwordx4 v[0:3] ; GFX89-DAG: buffer_store_dword v4 +; GFX11-DAG: buffer_store_b128 v[0:3] +; GFX11-DAG: buffer_store_b32 v4 define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 { %val = call <5 x i32> @external_v5i32_func_void() store volatile <5 x i32> 
%val, ptr addrspace(1) undef, align 8 @@ -254,7 +258,7 @@ ; GCN-LABEL: {{^}}test_call_external_v2i24_func_void: ; GCN: s_swappc_b64 -; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}v0, v1 +; GCN: v_add_{{(nc_)?}}{{i|u}}32_e32 v0, {{(vcc, )?}}v0, v1 define amdgpu_kernel void @test_call_external_v2i24_func_void() #0 { %val = call <2 x i24> @external_v2i24_func_void() %elt0 = extractelement <2 x i24> %val, i32 0 @@ -268,6 +272,7 @@ ; GCN: s_swappc ; GFX7-DAG: flat_store_dwordx3 {{.*}}, v[0:2] ; GFX89-DAG: buffer_store_dwordx3 v[0:2] +; GFX11-DAG: buffer_store_b96 v[0:2] define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 { %val = call <3 x float> @external_v3f32_func_void() store volatile <3 x float> %val, ptr addrspace(1) undef @@ -280,6 +285,8 @@ ; GFX7-DAG: flat_store_dword {{.*}}, v4 ; GFX89-DAG: buffer_store_dwordx4 v[0:3] ; GFX89-DAG: buffer_store_dword v4 +; GFX11-DAG: buffer_store_b128 v[0:3] +; GFX11-DAG: buffer_store_b32 v4 define amdgpu_kernel void @test_call_external_v5f32_func_void() #0 { %val = call <5 x float> @external_v5f32_func_void() store volatile <5 x float> %val, ptr addrspace(1) undef diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -1,11 +1,39 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,VI %s +; RUN: llc -march=amdgcn 
-mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s ; Make sure we don't crash or assert on spir_kernel calling convention. -; GCN-LABEL: {{^}}kernel: -; GCN: s_endpgm define spir_kernel void @kernel(ptr addrspace(1) %out) { +; SI-LABEL: kernel: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: kernel: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: kernel: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: store i32 0, ptr addrspace(1) %out ret void @@ -20,324 +48,909 @@ ; ret void ; } -; GCN-LABEL: {{^}}ps_ret_cc_f16: -; SI: v_cvt_f16_f32_e32 v0, v0 -; SI: v_cvt_f32_f16_e32 v0, v0 -; SI: v_add_f32_e32 v0, 1.0, v0 - -; VI: v_add_f16_e32 v0, 1.0, v0 -; VI: ; return define amdgpu_ps half @ps_ret_cc_f16(half %arg0) { +; SI-LABEL: ps_ret_cc_f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ps_ret_cc_f16: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ps_ret_cc_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } -; GCN-LABEL: 
{{^}}ps_ret_cc_inreg_f16: -; SI: v_cvt_f16_f32_e32 v0, s0 -; SI: v_cvt_f32_f16_e32 v0, v0 -; SI: v_add_f32_e32 v0, 1.0, v0 - -; VI: v_add_f16_e64 v0, s0, 1.0 -; VI: ; return define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) { +; SI-LABEL: ps_ret_cc_inreg_f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ps_ret_cc_inreg_f16: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e64 v0, s0, 1.0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ps_ret_cc_inreg_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e64 v0, s0, 1.0 +; GFX11-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } -; GCN-LABEL: {{^}}fastcc: -; GCN: v_add_f32_e32 v0, 4.0, v0 define fastcc float @fastcc(float %arg0) #0 { +; SIVI-LABEL: fastcc: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_add_f32_e32 v0, 4.0, v0 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fastcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %add = fadd float %arg0, 4.0 ret float %add } -; GCN-LABEL: {{^}}coldcc: -; GCN: v_add_f32_e32 v0, 4.0, v0 define coldcc float @coldcc(float %arg0) #0 { +; SIVI-LABEL: coldcc: +; SIVI: ; %bb.0: +; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SIVI-NEXT: v_add_f32_e32 v0, 4.0, v0 +; SIVI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: coldcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %add = fadd float %arg0, 4.0 ret float %add } -; GCN-LABEL: {{^}}call_coldcc: -; GCN: v_mov_b32_e32 v0, 1.0 -; GCN: s_swappc_b64 define amdgpu_kernel void @call_coldcc() #0 { +; SI-LABEL: 
call_coldcc: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s32, 0 +; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0xe8f000 +; SI-NEXT: s_add_u32 s8, s8, s1 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_getpc_b64 s[0:1] +; SI-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SI-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-NEXT: s_mov_b64 s[0:1], s[8:9] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: call_coldcc: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s90, -1 +; VI-NEXT: s_mov_b32 s91, 0xe80000 +; VI-NEXT: s_add_u32 s88, s88, s1 +; VI-NEXT: s_addc_u32 s89, s89, 0 +; VI-NEXT: s_getpc_b64 s[0:1] +; VI-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[0:1], s[88:89] +; VI-NEXT: s_mov_b64 s[2:3], s[90:91] +; VI-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: call_coldcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = call float @coldcc(float 1.0) store float %val, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}call_fastcc: -; GCN: v_mov_b32_e32 v0, 1.0 -; GCN: s_swappc_b64 define amdgpu_kernel void @call_fastcc() #0 { +; SI-LABEL: call_fastcc: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s32, 0 +; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0xe8f000 +; SI-NEXT: s_add_u32 s8, s8, s1 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_getpc_b64 s[0:1] +; SI-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; SI-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-NEXT: s_mov_b64 s[0:1], s[8:9] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: call_fastcc: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s90, -1 +; VI-NEXT: s_mov_b32 s91, 0xe80000 +; VI-NEXT: s_add_u32 s88, s88, s1 +; VI-NEXT: s_addc_u32 s89, s89, 0 +; VI-NEXT: s_getpc_b64 s[0:1] +; VI-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 +; VI-NEXT: s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; VI-NEXT: s_mov_b64 s[0:1], s[88:89] +; VI-NEXT: s_mov_b64 s[2:3], s[90:91] +; VI-NEXT: v_mov_b32_e32 v0, 1.0 +; VI-NEXT: s_mov_b32 s32, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] +; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: call_fastcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, 
fastcc@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = call float @fastcc(float 1.0) store float %val, ptr addrspace(1) undef ret void } ; Mesa compute shader: check for 47176 (COMPUTE_PGM_RSRC1) in .AMDGPU.config -; GCN-LABEL: .AMDGPU.config -; GCN: .long 47176 -; GCN-LABEL: {{^}}cs_mesa: define amdgpu_cs half @cs_mesa(half %arg0) { +; SI-LABEL: cs_mesa: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: cs_mesa: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: cs_mesa: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } ; Mesa pixel shader: check for 45096 (SPI_SHADER_PGM_RSRC1_PS) in .AMDGPU.config -; GCN-LABEL: .AMDGPU.config -; GCN: .long 45096 -; GCN-LABEL: {{^}}ps_mesa_f16: define amdgpu_ps half @ps_mesa_f16(half %arg0) { +; SI-LABEL: ps_mesa_f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ps_mesa_f16: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ps_mesa_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } ; Mesa vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in .AMDGPU.config -; GCN-LABEL: .AMDGPU.config -; GCN: .long 45352 -; GCN-LABEL: {{^}}vs_mesa: 
define amdgpu_vs half @vs_mesa(half %arg0) { +; SI-LABEL: vs_mesa: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: vs_mesa: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: vs_mesa: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } ; Mesa geometry shader: check for 45608 (SPI_SHADER_PGM_RSRC1_GS) in .AMDGPU.config -; GCN-LABEL: .AMDGPU.config -; GCN: .long 45608 -; GCN-LABEL: {{^}}gs_mesa: define amdgpu_gs half @gs_mesa(half %arg0) { +; SI-LABEL: gs_mesa: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: gs_mesa: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: gs_mesa: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } ; Mesa hull shader: check for 46120 (SPI_SHADER_PGM_RSRC1_HS) in .AMDGPU.config -; GCN-LABEL: .AMDGPU.config -; GCN: .long 46120 -; GCN-LABEL: {{^}}hs_mesa: define amdgpu_hs half @hs_mesa(half %arg0) { +; SI-LABEL: hs_mesa: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: hs_mesa: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: hs_mesa: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: ; return to shader part epilog %add = fadd half %arg0, 1.0 ret half %add } ; FIXME: Inconsistent ABI between targets -; GCN-LABEL: 
{{^}}ps_mesa_v2f16: -; VI: v_mov_b32_e32 v1, 0x3c00 -; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: ; return - -; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT0:v[0-9]+]], v0 -; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT1:v[0-9]+]], v1 -; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT0:v[0-9]+]], [[CVT_ELT0]] -; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT1:v[0-9]+]], [[CVT_ELT1]] -; SI-DAG: v_add_f32_e32 v0, 1.0, [[RECVT_ELT0]] -; SI-DAG: v_add_f32_e32 v1, 1.0, [[RECVT_ELT1]] -; SI: ; return to shader part epilog + define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) { +; SI-LABEL: ps_mesa_v2f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ps_mesa_v2f16: +; VI: ; %bb.0: +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ps_mesa_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: ; return to shader part epilog %add = fadd <2 x half> %arg0, ret <2 x half> %add } -; GCN-LABEL: {{^}}ps_mesa_inreg_v2f16: -; VI: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 -; VI-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e64 v1, s0, 1.0 -; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: ; return to shader part epilog - -; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT0:v[0-9]+]], s0 -; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT1:v[0-9]+]], s1 -; 
SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT0:v[0-9]+]], [[CVT_ELT0]] -; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT1:v[0-9]+]], [[CVT_ELT1]] -; SI-DAG: v_add_f32_e32 v0, 1.0, [[RECVT_ELT0]] -; SI-DAG: v_add_f32_e32 v1, 1.0, [[RECVT_ELT1]] -; SI: ; return to shader part epilog define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_v2f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v2 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ps_mesa_inreg_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_lshr_b32 s1, s0, 16 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; VI-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e64 v1, s0, 1.0 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ps_mesa_inreg_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: ; return to shader part epilog %add = fadd <2 x half> %arg0, ret <2 x half> %add } -; GCN-LABEL: {{^}}ps_mesa_v2i16: -; VI: v_mov_b32_e32 v2, 1 -; VI: v_add_u16_e32 v1, 1, v0 -; VI: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI: v_or_b32_e32 v0, v1, v0 - - -; SI: v_lshlrev_b32_e32 v1, 16, v1 -; SI: v_add_i32_e32 v0, vcc, 1, v0 -; SI: v_and_b32 -; SI: v_or_b32 -; SI: v_add_i32_e32 v0, vcc, 0x10000, v0 define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) { +; SI-LABEL: ps_mesa_v2i16: +; SI: ; %bb.0: +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x10000, v0 +; SI-NEXT: s_mov_b32 s2, -1 +; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_v2i16: +; VI: ; %bb.0: +; VI-NEXT: v_mov_b32_e32 v2, 1 +; VI-NEXT: v_add_u16_e32 v1, 1, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add <2 x i16> %arg0, store <2 x i16> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_inreg_v2i16: -; VI: s_and_b32 s1, s0, 0xffff0000 -; VI: s_add_i32 s0, s0, 1 -; VI: s_and_b32 s0, s0, 0xffff -; VI: s_or_b32 s0, s1, s0 -; VI: s_add_i32 s0, s0, 0x10000 -; VI: v_mov_b32_e32 v0, s0 - -; SI: s_lshl_b32 s1, s1, 16 -; SI: s_add_i32 s0, s0, 1 -; SI: s_and_b32 s0, s0, 0xffff -; SI: s_or_b32 s0, s1, s0 -; SI: s_add_i32 s0, s0, 0x10000 define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_lshl_b32 s1, s1, 16 +; SI-NEXT: s_add_i32 s0, s0, 1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_and_b32 s0, s0, 0xffff +; SI-NEXT: s_or_b32 s0, s1, s0 +; SI-NEXT: s_add_i32 s0, s0, 0x10000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_inreg_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_and_b32 s1, s0, 0xffff0000 +; VI-NEXT: s_add_i32 s0, s0, 1 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_add_i32 s0, s0, 0x10000 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_inreg_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_sub_u16 v0, s0, -1 op_sel_hi:[1,0] +; GFX11-NEXT: 
global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add <2 x i16> %arg0, store <2 x i16> %add, ptr addrspace(1) undef ret void } ; FIXME: Differenet ABI for VI+ -; GCN-LABEL: {{^}}ps_mesa_v4f16: -; SI: v_cvt_f16_f32_e32 v3, v3 -; SI: v_cvt_f16_f32_e32 v2, v2 -; SI: v_cvt_f16_f32_e32 v1, v1 -; SI: v_cvt_f16_f32_e32 v0, v0 - -; VI: v_add_f16_e32 v2, 1.0, v1 -; VI: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI: v_add_f16_e32 v4, 1.0, v0 -; VI: v_add_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD + define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) { +; SI-LABEL: ps_mesa_v4f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ps_mesa_v4f16: +; VI: ; %bb.0: +; VI-NEXT: v_mov_b32_e32 v3, 0x3c00 +; VI-NEXT: v_add_f16_e32 v2, 1.0, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 1.0, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ps_mesa_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: ; return to shader part epilog %add = fadd <4 x half> %arg0, ret <4 x half> 
%add } -; GCN-LABEL: {{^}}ps_mesa_inreg_v4f16: -; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, s3 -; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, s2 -; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, s1 -; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, s0 - -; VI: v_add_f16_e64 -; VI: v_add_f16_sdwa -; VI: v_add_f16_e64 -; VI: v_add_f16_sdwa define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_v4f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v0, s3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v5 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v4 +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: ps_mesa_inreg_v4f16: +; VI: ; %bb.0: +; VI-NEXT: v_add_f16_e64 v1, s1, 1.0 +; VI-NEXT: s_lshr_b32 s1, s1, 16 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; VI-NEXT: v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e64 v0, s0, 1.0 +; VI-NEXT: s_lshr_b32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_add_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v3 +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ps_mesa_inreg_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_f16 v1, s1, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: ; return to shader part epilog %add = fadd <4 x half> %arg0, ret <4 x half> %add } -; GCN-LABEL: {{^}}ps_mesa_inreg_v3i32: -; GCN-DAG: s_add_i32 s0, s0, 1 -; GCN-DAG: s_add_i32 s{{[0-9]*}}, s1, 2 -; GCN-DAG: s_add_i32 s{{[0-9]*}}, s2, 3 define amdgpu_ps void 
@ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_v3i32: +; SI: ; %bb.0: +; SI-NEXT: s_add_i32 s1, s1, 2 +; SI-NEXT: s_add_i32 s0, s0, 1 +; SI-NEXT: s_add_i32 s4, s2, 3 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_inreg_v3i32: +; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s2, s2, 3 +; VI-NEXT: s_add_i32 s1, s1, 2 +; VI-NEXT: s_add_i32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dwordx3 v[0:1], v[0:2] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_inreg_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: s_add_i32 s1, s1, 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add <3 x i32> %arg0, store <3 x i32> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_inreg_v3f32: -; GCN-DAG: v_add_f32{{.*}}, s0, 1.0 -; GCN-DAG: v_add_f32{{.*}}, s1, 2.0 -; GCN-DAG: v_add_f32{{.*}}, s2, 4.0 define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_v3f32: +; SI: ; %bb.0: +; SI-NEXT: v_add_f32_e64 v1, s1, 2.0 +; SI-NEXT: v_add_f32_e64 v0, s0, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s2, 4.0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_inreg_v3f32: +; VI: ; %bb.0: +; VI-NEXT: 
v_add_f32_e64 v2, s2, 4.0 +; VI-NEXT: v_add_f32_e64 v1, s1, 2.0 +; VI-NEXT: v_add_f32_e64 v0, s0, 1.0 +; VI-NEXT: flat_store_dwordx3 v[0:1], v[0:2] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_inreg_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v2, s2, 4.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 2.0 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd <3 x float> %arg0, store <3 x float> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_inreg_v5i32: -; GCN-DAG: s_add_i32 s0, s0, 1 -; GCN-DAG: s_add_i32 s{{[0-9]*}}, s1, 2 -; GCN-DAG: s_add_i32 s{{[0-9]*}}, s2, 3 -; GCN-DAG: s_add_i32 s{{[0-9]*}}, s3, 4 -; GCN-DAG: s_add_i32 s{{[0-9]*}}, s4, 5 define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_v5i32: +; SI: ; %bb.0: +; SI-NEXT: s_add_i32 s5, s3, 4 +; SI-NEXT: s_add_i32 s6, s2, 3 +; SI-NEXT: s_add_i32 s1, s1, 2 +; SI-NEXT: s_add_i32 s0, s0, 1 +; SI-NEXT: s_add_i32 s4, s4, 5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_inreg_v5i32: +; VI: ; %bb.0: +; VI-NEXT: s_add_i32 s4, s4, 5 +; VI-NEXT: s_add_i32 s3, s3, 4 +; VI-NEXT: s_add_i32 s2, s2, 3 +; VI-NEXT: s_add_i32 s1, s1, 2 +; VI-NEXT: s_add_i32 s0, s0, 1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_inreg_v5i32: +; 
GFX11: ; %bb.0: +; GFX11-NEXT: s_add_i32 s3, s3, 4 +; GFX11-NEXT: s_add_i32 s2, s2, 3 +; GFX11-NEXT: s_add_i32 s1, s1, 2 +; GFX11-NEXT: s_add_i32 s4, s4, 5 +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add <5 x i32> %arg0, store <5 x i32> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_inreg_v5f32: -; GCN-DAG: v_add_f32{{.*}}, s0, 1.0 -; GCN-DAG: v_add_f32{{.*}}, s1, 2.0 -; GCN-DAG: v_add_f32{{.*}}, s2, 4.0 -; GCN-DAG: v_add_f32{{.*}}, s3, -1.0 -; GCN-DAG: v_add_f32{{.*}}, s4, 0.5 define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_v5f32: +; SI: ; %bb.0: +; SI-NEXT: v_add_f32_e64 v3, s3, -1.0 +; SI-NEXT: v_add_f32_e64 v2, s2, 4.0 +; SI-NEXT: v_add_f32_e64 v1, s1, 2.0 +; SI-NEXT: v_add_f32_e64 v0, s0, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s4, 0.5 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_inreg_v5f32: +; VI: ; %bb.0: +; VI-NEXT: v_add_f32_e64 v3, s3, -1.0 +; VI-NEXT: v_add_f32_e64 v2, s2, 4.0 +; VI-NEXT: v_add_f32_e64 v1, s1, 2.0 +; VI-NEXT: v_add_f32_e64 v0, s0, 1.0 +; VI-NEXT: v_add_f32_e64 v4, s4, 0.5 +; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_inreg_v5f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v3, s3, -1.0 +; GFX11-NEXT: v_add_f32_e64 v2, s2, 4.0 +; GFX11-NEXT: v_add_f32_e64 v1, s1, 2.0 +; GFX11-NEXT: v_add_f32_e64 v4, s4, 0.5 +; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 +; GFX11-NEXT: 
s_clause 0x1 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd <5 x float> %arg0, store <5 x float> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_v3i32: -; GCN-DAG: v_add_{{.*}}, 1, v0 -; GCN-DAG: v_add_{{.*}}, 2, v1 -; GCN-DAG: v_add_{{.*}}, 3, v2 define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) { +; SI-LABEL: ps_mesa_v3i32: +; SI: ; %bb.0: +; SI-NEXT: v_add_i32_e32 v1, vcc, 2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_v3i32: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; VI-NEXT: flat_store_dwordx3 v[0:1], v[0:2] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add <3 x i32> %arg0, store <3 x i32> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_v3f32: -; GCN-DAG: v_add_{{.*}}, 1.0, v0 -; GCN-DAG: v_add_{{.*}}, 2.0, v1 -; GCN-DAG: v_add_{{.*}}, 4.0, v2 define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) { +; SI-LABEL: ps_mesa_v3f32: +; SI: ; %bb.0: +; SI-NEXT: v_add_f32_e32 v1, 2.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v2, 4.0, v2 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; 
VI-LABEL: ps_mesa_v3f32: +; VI: ; %bb.0: +; VI-NEXT: v_add_f32_e32 v2, 4.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 2.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: flat_store_dwordx3 v[0:1], v[0:2] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_add_f32 v2, 4.0, v2 :: v_dual_add_f32 v1, 2.0, v1 +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd <3 x float> %arg0, store <3 x float> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_v5i32: -; GCN-DAG: v_add_{{.*}}, 1, v0 -; GCN-DAG: v_add_{{.*}}, 2, v1 -; GCN-DAG: v_add_{{.*}}, 3, v2 -; GCN-DAG: v_add_{{.*}}, 4, v3 -; GCN-DAG: v_add_{{.*}}, 5, v4 define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) { +; SI-LABEL: ps_mesa_v5i32: +; SI: ; %bb.0: +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 2, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 5, v4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_v5i32: +; VI: ; %bb.0: +; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v4 +; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_v5i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_nc_u32_e32 v3, 4, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 5, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off +; 
GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add <5 x i32> %arg0, store <5 x i32> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_v5f32: -; GCN-DAG: v_add_f32{{.*}}, 1.0, v0 -; GCN-DAG: v_add_f32{{.*}}, 2.0, v1 -; GCN-DAG: v_add_f32{{.*}}, 4.0, v2 -; GCN-DAG: v_add_f32{{.*}}, -1.0, v3 -; GCN-DAG: v_add_f32{{.*}}, 0.5, v4 define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) { +; SI-LABEL: ps_mesa_v5f32: +; SI: ; %bb.0: +; SI-NEXT: v_add_f32_e32 v3, -1.0, v3 +; SI-NEXT: v_add_f32_e32 v2, 4.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 2.0, v1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_add_f32_e32 v4, 0.5, v4 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_v5f32: +; VI: ; %bb.0: +; VI-NEXT: v_add_f32_e32 v3, -1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 4.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 2.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v4, 0.5, v4 +; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_v5f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4 +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd <5 x float> %arg0, store <5 x float> %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_i16: -; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, v0, v0 -; VI: v_add_u16_e32 v{{[0-9]+}}, v0, v0 define amdgpu_ps void @ps_mesa_i16(i16 %arg0) { +; SI-LABEL: ps_mesa_i16: +; SI: ; %bb.0: 
+; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_i16: +; VI: ; %bb.0: +; VI-NEXT: v_add_u16_e32 v0, v0, v0 +; VI-NEXT: flat_store_short v[0:1], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_nc_u16 v0, v0, v0 +; GFX11-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add i16 %arg0, %arg0 store i16 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ps_mesa_inreg_i16: -; GCN: s_add_i32 s{{[0-9]+}}, s0, s0 define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) { +; SI-LABEL: ps_mesa_inreg_i16: +; SI: ; %bb.0: +; SI-NEXT: s_add_i32 s0, s0, s0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ps_mesa_inreg_i16: +; VI: ; %bb.0: +; VI-NEXT: s_and_b32 s0, 0xffff, s0 +; VI-NEXT: s_add_i32 s0, s0, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: flat_store_short v[0:1], v0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: ps_mesa_inreg_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s0, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = add i16 %arg0, %arg0 store i16 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}ret_ps_mesa_i16: -; GCN: s_movk_i32 s0, 0x7b define amdgpu_ps i16 @ret_ps_mesa_i16() { +; GCN-LABEL: ret_ps_mesa_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_movk_i32 s0, 0x7b +; GCN-NEXT: ; return to shader part epilog ret i16 123 } diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll 
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -1,12 +1,70 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s - -; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32: -; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}} -; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 2.0, 4.0 define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: 
v_test_nnan_input_fmed3_r_i_i_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -19,9 +77,67 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_f32: -; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { 
+; SI-LABEL: v_test_fmed3_nnan_r_i_i_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_fmed3_nnan_r_i_i_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -35,9 +151,67 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_commute0_f32: -; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -51,9 +225,67 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_commute1_f32: -; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -67,10 +299,70 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_constant_order_f32: -; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: 
v_test_fmed3_nnan_r_i_i_constant_order_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_max_f32_e32 v2, 4.0, v2 +; SI-NEXT: v_min_f32_e32 v2, 2.0, v2 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-NEXT: v_max_f32_e32 v2, 4.0, v2 +; VI-NEXT: v_min_f32_e32 v2, 2.0, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_max_f32_e32 v1, 4.0, v1 +; GFX9-NEXT: v_min_f32_e32 v1, 2.0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_maxmin_f32 v1, v1, 4.0, 2.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -84,10 +376,83 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_multi_use_f32: -; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_max_f32_e32 v2, 2.0, v2 +; SI-NEXT: v_min_f32_e32 v3, 4.0, v2 +; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: 
v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-NEXT: v_max_f32_e32 v2, 2.0, v2 +; VI-NEXT: v_min_f32_e32 v3, 4.0, v2 +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_max_f32_e32 v1, 2.0, v1 +; GFX9-NEXT: v_min_f32_e32 v2, 4.0, v1 +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v1, 2.0, v1 +; GFX11-NEXT: v_min_f32_e32 v2, 4.0, v1 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -102,10 +467,71 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64: -; GCN: v_max_f64 
{{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0 -; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0 define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_fmed3_r_i_i_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; SI-NEXT: v_max_f64 v[2:3], v[2:3], 2.0 +; SI-NEXT: v_min_f64 v[2:3], v[2:3], 4.0 +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_fmed3_r_i_i_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 +; VI-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_fmed3_r_i_i_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: 
global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_fmed3_r_i_i_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr double, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr double, ptr addrspace(1) %out, i32 %tid @@ -119,9 +545,62 @@ ret void } -; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32: -; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #2 { +; SI-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword 
v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_med3_f32 v2, v3, 2.0, 4.0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_fmed3_r_i_i_no_nans_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -134,9 +613,67 @@ ret void } -; GCN-LABEL: {{^}}v_test_legacy_fmed3_r_i_i_f32: -; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: 
v_med3_f32 v2, v2, 2.0, 4.0 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 1.0, v3 +; VI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_legacy_fmed3_r_i_i_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -155,12 +692,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0: -; GCN: 
{{buffer_|flat_|global_}}load_dword [[A:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, -v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; 
VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, -v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, -v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, -v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -178,12 +791,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod1: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, 
[[A]], -[[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, -v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, -v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, -v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, -v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -201,12 +890,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod2: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], -[[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; 
SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, -v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, -v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, -v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, -v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -224,12 +989,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod012: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], |[[B]]|, -|[[C]]| define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; 
SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, -v2, |v3|, -|v4| +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, -v7, |v2|, -|v3| +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: 
global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3| +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -253,12 +1094,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_negabs012: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, -|[[A]]|, -|[[B]]|, -|[[C]]| define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], 
s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, -|v2|, -|v3|, -|v4| +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, -|v7|, -|v2|, -|v3| +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3| +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -284,15 +1201,100 @@ ret void } -; GCN-LABEL: {{^}}v_nnan_inputs_med3_f32_pat0: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN-DAG: v_add_f32_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]] -; GCN-DAG: v_add_f32_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]] -; GCN-DAG: v_add_f32_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]] define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_nnan_inputs_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; 
SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_nnan_inputs_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_inputs_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -314,13 +1316,88 @@ ret void } - -; GCN-LABEL: {{^}}v_nnan_input_calls_med3_f32_pat0: -; GCN: {{buffer_|flat_|global_}}load_dword [[A:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_nnan_input_calls_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: 
s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_nnan_input_calls_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_input_calls_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_input_calls_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -337,12 +1414,88 @@ ret void } -; GCN-LABEL: {{^}}v_nnan_call_med3_f32_pat0: -; GCN: {{buffer_|flat_|global_}}load_dword [[A:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_nnan_call_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt 
lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_nnan_call_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_call_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_call_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -359,12 +1512,88 @@ ret void } -; GCN-LABEL: {{^}}v_fast_call_med3_f32_pat0: -; GCN: {{buffer_|flat_|global_}}load_dword [[A:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[B:v[0-9]+]] -; GCN: {{buffer_|flat_|global_}}load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_fast_call_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; 
SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_fast_call_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_fast_call_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_fast_call_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -391,15 +1620,90 @@ ; 5: max(min(y, x), min(max(y, x), z)) ; 6: max(min(y, x), min(z, max(x, y))) ; 7: max(min(y, x), min(z, max(y, x))) -; ; + commute outermost max -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -416,12 +1720,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat1: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], 
s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, 
v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -438,12 +1818,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat2: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat2: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; 
VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, 
i32 %tid @@ -460,12 +1916,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat3: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat3: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat3: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; 
VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -482,12 +2014,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat4: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword 
[[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat4: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat4: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -504,12 +2112,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat5: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: 
v_test_global_nnans_med3_f32_pat5: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat5: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -526,12 +2210,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat6: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat6: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: 
s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat6: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -548,12 +2308,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat7: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat7: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword 
v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat7: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: 
v_test_global_nnans_med3_f32_pat7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -570,12 +2406,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat8: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat8: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat8: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -592,12 +2504,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat9: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat9: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat9: +; VI: ; %bb.0: +; 
VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat9: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; 
GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -614,12 +2602,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat10: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat10: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat10: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; 
VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat10: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = 
getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -636,12 +2700,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat11: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat11: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat11: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, 
vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat11: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -658,12 +2798,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat12: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword 
[[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat12: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat12: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat12: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -680,12 +2896,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat13: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr 
addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat13: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat13: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: 
v_test_global_nnans_med3_f32_pat13: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat13: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -702,12 +2994,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat14: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat14: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat14: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat14: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -724,12 +3092,88 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat15: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat15: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], 
s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v3, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat15: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v2, v7, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX9-NEXT: 
global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat15: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -749,12 +3193,88 @@ ; Also handle `min` at the root: ; min(max(x, y), max(min(x, y), z)) -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat16: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], 
s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_med3_f32 v2, v7, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; 
GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -775,12 +3295,119 @@ ; Negative patterns ; --------------------------------------------------------------------- -; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use0: -; GCN-DAG: v_min_f32 -; GCN-DAG: v_max_f32 -; GCN: v_min_f32 -; GCN: v_max_f32 define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, 
v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: buffer_store_dword v5, off, s[8:11], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: global_store_dword v[0:1], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 +; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v4 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -798,8 +3425,120 @@ ret void } -; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use1: define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use1: 
+; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) 
+; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v1, v2 +; GFX11-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -817,8 +3556,119 @@ ret void } -; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use2: define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) +; SI-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v3, v3, 
v3 +; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_maxmin_f32 v3, v1, v2, v3 +; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -836,9 +3686,109 @@ ret void } - -; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0: define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_test_safe_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; 
SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_min_f32_e32 v5, v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; SI-NEXT: v_min_f32_e32 v2, v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_safe_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: flat_load_dword v6, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_min_f32_e32 v5, v4, v2 +; VI-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: v_max_f32_e32 v2, v5, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; 
GFX9-LABEL: v_test_safe_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_safe_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_maxmin_f32 v3, v1, v2, v3 +; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -855,8 +3805,100 @@ ret void } -; GCN-LABEL: {{^}}v_nnan_inputs_missing0_med3_f32_pat0: define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1) %out, ptr 
addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc +; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_inputs_missing0_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -878,8 +3920,100 @@ ret void } -; GCN-LABEL: {{^}}v_nnan_inputs_missing1_med3_f32_pat0: define amdgpu_kernel void 
@v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: 
v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_inputs_missing1_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -901,8 +4035,100 @@ ret void } -; GCN-LABEL: 
{{^}}v_nnan_inputs_missing2_med3_f32_pat0: define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; 
VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 +; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 +; VI-NEXT: v_med3_f32 v2, v4, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2 +; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_inputs_missing2_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, 
i32 %tid @@ -924,15 +4150,98 @@ ret void } -; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN-DAG: v_min_f32 -; GCN-DAG: v_max_f32 -; GCN-DAG: v_min_f32 -; GCN-DAG: v_max_f32 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: buffer_load_dword v4, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_min_f32_e64 v5, -v2, v3 +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-NEXT: v_max_f32_e32 v2, v5, v2 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, 
v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: flat_load_dword v6, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_min_f32_e64 v4, -v6, v2 +; VI-NEXT: v_max_f32_e32 v2, v6, v2 +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: v_max_f32_e32 v2, v4, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_min_f32_e64 v4, -v1, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v4, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_maxmin_f32 v3, v1, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_minmax_f32 v1, -v1, v2, v3 +; 
GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -951,13 +4260,91 @@ } ; A simple min and max is not sufficient -; GCN-LABEL: {{^}}v_test_global_nnans_min_max_f32: -; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]] -; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[A]], [[B]] -; GCN: v_min_f32_e32 v{{[0-9]+}}, [[MAX]], [[C]] define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #2 { +; SI-LABEL: v_test_global_nnans_min_max_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_max_f32_e32 v2, v2, v3 +; SI-NEXT: v_min_f32_e32 v2, v2, v4 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_global_nnans_min_max_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: 
v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_max_f32_e32 v2, v7, v2 +; VI-NEXT: v_min_f32_e32 v2, v2, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_global_nnans_min_max_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_min_max_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_maxmin_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr float, ptr addrspace(1) %bptr, i32 %tid @@ -972,19 +4359,70 @@ ret void } -; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f16: -; SI: v_cvt_f32_f16 -; SI: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; SI: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 -; SI: v_cvt_f16_f32 - -; VI: v_add_f16_e32 v{{[0-9]+}}, 1.0 -; VI: v_max_f16_e32 v{{[0-9]+}}, 2.0 -; VI: v_min_f16_e32 v{{[0-9]+}}, 4.0 - -; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0 -; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0 define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_med3_f32 v2, v2, 2.0, 4.0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_e32 v2, 
1.0, v3 +; VI-NEXT: v_max_f16_e32 v2, 2.0, v2 +; VI-NEXT: v_min_f16_e32 v2, 4.0, v2 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX9-NEXT: v_med3_f16 v1, v1, 2.0, 4.0 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f16 v1, v1, 2.0, 4.0 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid %outgep = getelementptr half, ptr addrspace(1) %out, i32 %tid @@ -997,31 +4435,108 @@ ret void } -; GCN-LABEL: {{^}}v_nnan_inputs_med3_f16_pat0: -; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[B:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_ushort [[C:v[0-9]+]] - -; SI: v_cvt_f32_f16 -; SI: v_cvt_f32_f16 -; SI: v_add_f32_e32 -; SI: v_add_f32_e32 -; SI: v_add_f32_e32 -; SI: v_med3_f32 -; SI: v_cvt_f16_f32_e32 - - -; GFX89-DAG: v_add_f16_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]] -; GFX89-DAG: v_add_f16_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]] -; GFX89-DAG: v_add_f16_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]] - -; VI-DAG: v_min_f16 -; VI-DAG: v_max_f16 -; VI: v_min_f16 -; VI: v_max_f16 - -; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], 
[[C_ADD]] define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #1 { +; SI-LABEL: v_nnan_inputs_med3_f16_pat0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[14:15], s[10:11] +; SI-NEXT: s_mov_b64 s[18:19], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[12:13], s[4:5] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[12:15], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v4, v[0:1], s[16:19], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v3, 2.0, v3 +; SI-NEXT: v_add_f32_e32 v4, 4.0, v4 +; SI-NEXT: v_med3_f32 v2, v2, v3, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_nnan_inputs_med3_f16_pat0: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v6, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_ushort v7, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_ushort 
v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_ushort v3, v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_f16_e32 v4, 1.0, v7 +; VI-NEXT: v_add_f16_e32 v2, 2.0, v2 +; VI-NEXT: v_add_f16_e32 v3, 4.0, v3 +; VI-NEXT: v_min_f16_e32 v5, v4, v2 +; VI-NEXT: v_max_f16_e32 v2, v4, v2 +; VI-NEXT: v_min_f16_e32 v2, v2, v3 +; VI-NEXT: v_max_f16_e32 v2, v5, v2 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_nnan_inputs_med3_f16_pat0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f16_e32 v2, 2.0, v2 +; GFX9-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX9-NEXT: v_med3_f16 v1, v1, v2, v3 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_inputs_med3_f16_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_u16 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX11-NEXT: v_add_f16_e32 v2, 2.0, v2 +; GFX11-NEXT: v_add_f16_e32 v3, 4.0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f16 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr half, ptr addrspace(1) %aptr, i32 %tid %gep1 = getelementptr half, ptr addrspace(1) %bptr, i32 %tid @@ -1043,11 +4558,71 @@ ret void } -; GCN-LABEL: {{^}}two_non_inline_constant: -; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, -; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x41000000, [[ADD]] -; GCN: v_min_f32_e32 v{{[0-9]+}}, 0x41800000, [[MAX]] define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: two_non_inline_constant: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0.5, v2 +; SI-NEXT: v_max_f32_e32 v2, 0x41000000, v2 +; SI-NEXT: v_min_f32_e32 v2, 0x41800000, v2 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-LABEL: two_non_inline_constant: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 0.5, v3 +; VI-NEXT: v_max_f32_e32 v2, 0x41000000, v2 +; VI-NEXT: v_min_f32_e32 v2, 0x41800000, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: two_non_inline_constant: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1 +; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v1 +; GFX9-NEXT: v_min_f32_e32 v1, 0x41800000, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: two_non_inline_constant: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_mov_b32 s2, 0x41000000 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_maxmin_f32 v1, v1, s2, 0x41800000 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -1061,11 +4636,83 @@ } ; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants. 
-; GCN-LABEL: {{^}}one_non_inline_constant: -; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41800000 -; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, -; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 1.0, [[K1]] define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: one_non_inline_constant: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v3, 0x41800000 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v4, 0.5, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x41800000, v2 +; SI-NEXT: v_med3_f32 v3, v4, 1.0, v3 +; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; VI-LABEL: one_non_inline_constant: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x41800000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 0.5, v3 +; VI-NEXT: v_med3_f32 v2, v2, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 0x41800000, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: one_non_inline_constant: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x41800000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v3, 0.5, v1 +; GFX9-NEXT: v_add_f32_e32 v1, 0x41800000, v1 +; GFX9-NEXT: v_med3_f32 v2, v3, 1.0, v2 +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: one_non_inline_constant: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v2, 0.5, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 0x41800000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_med3_f32 v2, v2, 1.0, 0x41800000 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -1081,12 +4728,99 @@ ret void } -; GCN-LABEL: {{^}}two_non_inline_constant_multi_use: -; GCN-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x41000000 -; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x41800000 -; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, -; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], [[K0]], [[VK1]] define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { +; SI-LABEL: two_non_inline_constant_multi_use: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; 
SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, 0x41000000 +; SI-NEXT: v_mov_b32_e32 v3, 0x41800000 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_f32_e32 v4, 0.5, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x41800000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x41000000, v2 +; SI-NEXT: v_med3_f32 v3, v4, s4, v3 +; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_store_dword v5, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; +; VI-LABEL: two_non_inline_constant_multi_use: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x41800000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b32 s2, 0x41000000 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 0.5, v3 +; VI-NEXT: v_med3_f32 v2, v2, s2, v4 +; VI-NEXT: v_add_f32_e32 v5, 0x41800000, v3 +; VI-NEXT: v_add_f32_e32 v3, 0x41000000, v3 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: two_non_inline_constant_multi_use: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x41800000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0x41000000 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_add_f32_e32 v3, 0.5, v1 +; GFX9-NEXT: v_add_f32_e32 v4, 0x41800000, v1 +; GFX9-NEXT: v_add_f32_e32 v1, 0x41000000, v1 +; GFX9-NEXT: v_med3_f32 v2, v3, s2, v2 +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v[0:1], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: two_non_inline_constant_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_mov_b32 s2, 0x41000000 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v3, 0x41800000, v1 +; GFX11-NEXT: v_add_f32_e32 v2, 0.5, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 0x41000000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_med3_f32 v2, v2, s2, 0x41800000 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, ptr addrspace(1) %aptr, i32 %tid %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid @@ -1117,3 +4851,5 @@ attributes #0 = { nounwind readnone } attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" } attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -1,11 +1,12 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: not llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,GFX11 %s +; RUN: not llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,R600 %s ; FUNC-LABEL: {{^}}s_fneg_f32: ; R600: -PV -; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_load_{{dword|b32}} [[VAL:s[0-9]+]] ; GCN: s_xor_b32 [[NEG_VAL:s[0-9]+]], [[VAL]], 0x80000000 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[NEG_VAL]] define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { @@ -61,10 +62,11 @@ ; FUNC-LABEL: {{^}}fneg_free_f32: ; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GFX11: s_load_b32 [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN: s_xor_b32 [[RES:s[0-9]+]], [[NEG_VALUE]], 0x80000000 ; GCN: v_mov_b32_e32 [[V_RES:v[0-9]+]], [[RES]] -; GCN: buffer_store_dword [[V_RES]] +; GCN: buffer_store_{{dword|b32}} [[V_RES]] ; R600-NOT: XOR ; R600: -PV.W @@ 
-78,6 +80,7 @@ ; FUNC-LABEL: {{^}}fneg_fold_f32: ; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GFX11: s_load_{{dword|b32}} [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: xor ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { @@ -100,7 +103,7 @@ } ; FUNC-LABEL: {{^}}s_fneg_i32: -; GCN: s_load_dword [[IN:s[0-9]+]] +; GCN: s_load_{{dword|b32}} [[IN:s[0-9]+]] ; GCN: s_xor_b32 [[FNEG:s[0-9]+]], [[IN]], 0x80000000 ; GCN: v_mov_b32_e32 [[V_FNEG:v[0-9]+]], [[FNEG]] define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { @@ -111,6 +114,7 @@ ; FUNC-LABEL: {{^}}v_fneg_i32: ; GCN: s_waitcnt +; GFX11: s_waitcnt_vscnt ; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: s_setpc_b64 define i32 @v_fneg_i32(i32 %in) { @@ -119,7 +123,7 @@ } ; FUNC-LABEL: {{^}}s_fneg_i32_fp_use: -; GCN: s_load_dword [[IN:s[0-9]+]] +; GCN: s_load_{{dword|b32}} [[IN:s[0-9]+]] ; GCN: v_sub_f32_e64 v{{[0-9]+}}, 2.0, [[IN]] define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { %fneg = xor i32 %in, -2147483648 @@ -131,6 +135,7 @@ ; FUNC-LABEL: {{^}}v_fneg_i32_fp_use: ; GCN: s_waitcnt +; GFX11: s_waitcnt_vscnt ; GCN-NEXT: v_sub_f32_e32 v0, 2.0, v0 ; GCN-NEXT: s_setpc_b64 define float @v_fneg_i32_fp_use(i32 %in) { @@ -150,6 +155,7 @@ ; FUNC-LABEL: {{^}}v_fneg_i64: ; GCN: s_waitcnt +; GFX11: s_waitcnt_vscnt ; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GCN-NEXT: s_setpc_b64 define i64 @v_fneg_i64(i64 %in) { @@ -169,6 +175,7 @@ ; FUNC-LABEL: {{^}}v_fneg_i64_fp_use: ; GCN: s_waitcnt +; GFX11: s_waitcnt_vscnt ; GCN-NEXT: v_add_f64 v[0:1], -v[0:1], 2.0 ; GCN-NEXT: s_setpc_b64 define double @v_fneg_i64_fp_use(i64 %in) { @@ -180,6 +187,7 @@ ; FUNC-LABEL: {{^}}v_fneg_i16: ; GCN: s_waitcnt +; GFX11: s_waitcnt_vscnt ; GCN-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GCN-NEXT: 
s_setpc_b64 define i16 @v_fneg_i16(i16 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -1,16 +1,64 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN,GFX11 %s -; GCN-LABEL: {{^}}uitofp_i16_to_f16 -; GCN: buffer_load_ushort v[[A_I16:[0-9]+]] -; SI: v_cvt_f32_u32_e32 v[[A_F32:[0-9]+]], v[[A_I16]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] - -; VI: v_cvt_f16_u16_e32 v[[R_F16:[0-9]+]], v[[A_I16]] - -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @uitofp_i16_to_f16( +; SI-LABEL: uitofp_i16_to_f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: 
s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: uitofp_i16_to_f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f16_u16_e32 v0, v0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: uitofp_i16_to_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -20,13 +68,65 @@ ret void } -; GCN-LABEL: {{^}}uitofp_i32_to_f16 -; GCN: buffer_load_dword v[[A_I32:[0-9]+]] -; GCN: v_cvt_f32_u32_e32 v[[A_I16:[0-9]+]], v[[A_I32]] -; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @uitofp_i32_to_f16( +; SI-LABEL: uitofp_i32_to_f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 
-1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: uitofp_i32_to_f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: uitofp_i32_to_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -38,24 +138,74 @@ ; f16 = uitofp i64 is in uint_to_fp.i64.ll -; GCN-LABEL: {{^}}uitofp_v2i16_to_v2f16 -; GCN: buffer_load_dword - -; SI: 
v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f16_f32_e32 -; SI: v_cvt_f16_f32_e32 -; SI-DAG: v_lshlrev_b32_e32 -; SI: v_or_b32_e32 - - -; VI-DAG: v_cvt_f16_u16_e32 -; VI-DAG: v_cvt_f16_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; VI: v_or_b32_e32 - -; GCN: buffer_store_dword -; GCN: s_endpgm define amdgpu_kernel void @uitofp_v2i16_to_v2f16( +; SI-LABEL: uitofp_v2i16_to_v2f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 +; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: uitofp_v2i16_to_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f16_u16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f16_u16_e32 v0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: uitofp_v2i16_to_v2f16: +; GFX11: ; %bb.0: ; 
%entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1 +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -65,25 +215,76 @@ ret void } -; GCN-LABEL: {{^}}uitofp_v2i32_to_v2f16 -; GCN: buffer_load_dwordx2 - -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f32_u32_e32 -; SI: v_cvt_f16_f32_e32 -; SI: v_cvt_f16_f32_e32 -; SI-DAG: v_lshlrev_b32_e32 -; SI: v_or_b32_e32 - -; VI-DAG: v_cvt_f32_u32_e32 -; VI-DAG: v_cvt_f32_u32_e32 -; VI-DAG: v_cvt_f16_f32_e32 -; VI-DAG: v_cvt_f16_f32_sdwa -; VI: v_or_b32_e32 - -; GCN: buffer_store_dword -; GCN: s_endpgm define amdgpu_kernel void @uitofp_v2i32_to_v2f16( +; SI-LABEL: uitofp_v2i32_to_v2f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: uitofp_v2i32_to_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 +; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 +; VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: uitofp_v2i32_to_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -93,15 +294,91 @@ ret void } -; GCN-LABEL: 
{{^}}s_uint_to_fp_i1_to_f16: -; GCN-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 1.0, {{v[0-9]+}} -; GCN-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}} -; GCN: s_xor_b64 [[R_CMP:s\[[0-9]+:[0-9]+\]]], [[CMP1]], [[CMP0]] -; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[R_CMP]] -; GCN-NEXT: v_cvt_f16_f32_e32 [[R_F16:v[0-9]+]], [[RESULT]] -; GCN: buffer_store_short -; GCN: s_endpgm define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; SI-LABEL: s_uint_to_fp_i1_to_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_uint_to_fp_i1_to_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_waitcnt 
vmcnt(1) +; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: s_uint_to_fp_i1_to_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 %acmp = fcmp oge float %a, 0.000000e+00 @@ -113,3 +390,5 @@ } ; f16 = uitofp i64 is in uint_to_fp.i64.ll +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/unpack-half.ll b/llvm/test/CodeGen/AMDGPU/unpack-half.ll --- a/llvm/test/CodeGen/AMDGPU/unpack-half.ll +++ b/llvm/test/CodeGen/AMDGPU/unpack-half.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s ; On gfx6 and gfx7, this test shows a bug in SelectionDAG where scalarizing the ; extension of a vector of f16 generates an illegal node that errors later. diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll --- a/llvm/test/CodeGen/AMDGPU/v_mac.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll @@ -1,14 +1,126 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-FLUSH,GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s - -; GCN-LABEL: {{^}}mac_vvv: -; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 glc{{$}} -; GCN: buffer_load_dword [[B:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 -; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8 -; GCN: v_mac_f32_e32 [[C]], [[A]], [[B]] -; GCN: buffer_store_dword [[C]] +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn 
-mattr=+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s + define amdgpu_kernel void @mac_vvv(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: mac_vvv: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mac_vvv: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-FLUSH-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mac_vvv: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mac_vvv: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: buffer_load_b32 v0, off, s[8:11], 0 glc dlc +; 
GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: buffer_load_b32 v1, off, s[8:11], 0 offset:4 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: buffer_load_b32 v2, off, s[8:11], 0 offset:8 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mac_vvv: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v1, off, s[8:11], 0 offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v2, off, s[8:11], 0 offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -23,10 +135,69 @@ ret void } -; GCN-LABEL: {{^}}mad_inline_sgpr_inline: -; GCN-NOT: v_mac_f32 -; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5 define amdgpu_kernel void @mad_inline_sgpr_inline(ptr addrspace(1) %out, float %in) #0 { +; SI-LABEL: mad_inline_sgpr_inline: +; SI: ; 
%bb.0: ; %entry +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mad_f32 v0, s4, 0.5, 0.5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mad_inline_sgpr_inline: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s2, -1 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mad_f32 v0, s4, 0.5, 0.5 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mad_inline_sgpr_inline: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dword s4, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mad_f32 v0, s4, 0.5, 0.5 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_inline_sgpr_inline: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_clause 0x1 +; GFX11-FLUSH-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f32_e64 v0, s2, 0.5 +; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f32_e32 v0, 0.5, v0 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mad_inline_sgpr_inline: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-NEXT: v_mul_f32_e64 v0, s2, 0.5 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, 0.5, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %tmp0 = fmul float 0.5, %in %tmp1 = fadd float %tmp0, 0.5 @@ -34,10 +205,107 @@ ret void } -; GCN-LABEL: {{^}}mad_vvs: -; GCN-NOT: v_mac_f32 -; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @mad_vvs(ptr addrspace(1) %out, ptr addrspace(1) %in, float %c) #0 { +; SI-LABEL: mad_vvs: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mad_f32 v0, v0, v1, s12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mad_vvs: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dword s12, s[0:1], 0x34 +; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s2, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s2 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s6 +; VI-FLUSH-NEXT: s_mov_b32 s9, s7 +; VI-FLUSH-NEXT: s_mov_b32 s11, s3 +; VI-FLUSH-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s0, s4 +; VI-FLUSH-NEXT: s_mov_b32 s1, s5 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mad_f32 v0, v0, v1, s12 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mad_vvs: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s12, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f32 v0, v0, v1, s12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_vvs: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2 +; GFX11-FLUSH-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s7 +; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5 +; GFX11-FLUSH-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mad_vvs: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s11, s3 +; GFX11-NEXT: s_mov_b32 s10, s2 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, s0, v0 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 @@ -50,9 +318,107 @@ ret void } -; GCN-LABEL: {{^}}mac_ssv: -; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @mac_ssv(ptr addrspace(1) %out, ptr addrspace(1) %in, float %a) #0 { +; SI-LABEL: mac_ssv: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mac_f32_e64 v0, s12, s12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mac_ssv: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dword s12, s[0:1], 0x34 +; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s2, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s2 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s6 +; VI-FLUSH-NEXT: s_mov_b32 s9, s7 +; VI-FLUSH-NEXT: s_mov_b32 s11, s3 +; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s0, s4 +; VI-FLUSH-NEXT: s_mov_b32 s1, s5 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f32_e64 v0, s12, s12 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mac_ssv: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s12, s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 
0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f32_e64 v0, s12, s12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mac_ssv: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2 +; GFX11-FLUSH-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s7 +; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5 +; GFX11-FLUSH-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FLUSH-NEXT: v_mul_f32_e64 v1, s0, s0 +; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mac_ssv: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s11, s3 +; GFX11-NEXT: s_mov_b32 s10, s2 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: v_mul_f32_e64 v1, s0, s0 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 
v0, v1, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %c = load float, ptr addrspace(1) %in @@ -62,10 +428,147 @@ ret void } -; GCN-LABEL: {{^}}mac_mad_same_add: -; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] -; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mac_mad_same_add(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: mac_mad_same_add: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mad_f32 v0, v0, v2, v1 +; SI-NEXT: v_mac_f32_e32 v1, v3, v4 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mac_mad_same_add: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[8:11], 0 
offset:4 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:16 glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: v_mad_f32 v0, v0, v2, v1 +; VI-FLUSH-NEXT: v_mac_f32_e32 v1, v3, v4 +; VI-FLUSH-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mac_mad_same_add: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:8 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:12 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v4, off, s[8:11], 0 offset:16 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mad_f32 v0, v0, v2, v1 +; VI-NEXT: v_mac_f32_e32 v1, v3, v4 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mac_mad_same_add: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; 
GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: buffer_load_b32 v0, off, s[8:11], 0 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: buffer_load_b32 v1, off, s[8:11], 0 offset:4 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: buffer_load_b32 v2, off, s[8:11], 0 offset:8 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: buffer_load_b32 v3, off, s[8:11], 0 offset:12 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: buffer_load_b32 v4, off, s[8:11], 0 offset:16 glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: v_dual_mul_f32 v0, v0, v1 :: v_dual_mul_f32 v1, v3, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-FLUSH-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mac_mad_same_add: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v1, off, s[8:11], 0 offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v2, off, s[8:11], 0 offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v3, off, s[8:11], 0 offset:12 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v4, off, s[8:11], 0 offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, 
s1 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v1 :: v_dual_mul_f32 v1, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -92,11 +595,103 @@ ; There is no advantage to using v_mac when one of the operands is negated ; and v_mad accepts more operand types. - -; GCN-LABEL: {{^}}mad_neg_src0: -; GCN-NOT: v_mac_f32 -; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} define amdgpu_kernel void @mad_neg_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: mad_neg_src0: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mad_neg_src0: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: 
s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mad_neg_src0: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_neg_src0: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mad_neg_src0: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b96 
v[0:2], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -113,10 +708,103 @@ ret void } -; GCN-LABEL: {{^}}nsz_mad_sub0_src0: -; GCN-NOT: v_mac_f32 -; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} define amdgpu_kernel void @nsz_mad_sub0_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; SI-LABEL: nsz_mad_sub0_src0: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: nsz_mad_sub0_src0: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, 
s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: nsz_mad_sub0_src0: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: nsz_mad_sub0_src0: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: nsz_mad_sub0_src0: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -133,10 +821,109 @@ ret void } -; GCN-LABEL: {{^}}safe_mad_sub0_src0: -; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0, -; GCN: v_ma{{[cd]}}_f32{{[_e32]*}} v{{[0-9]+}}, [[SUB0]], v{{[0-9]+}} define amdgpu_kernel void @safe_mad_sub0_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: safe_mad_sub0_src0: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_sub_f32_e32 v0, 0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: safe_mad_sub0_src0: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_sub_f32_e32 v0, 0, v0 +; VI-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; 
VI-FLUSH-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: safe_mad_sub0_src0: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_f32_e32 v0, 0, v0 +; VI-NEXT: v_mac_f32_e32 v2, v0, v1 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: safe_mad_sub0_src0: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_sub_f32_e32 v0, 0, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: safe_mad_sub0_src0: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: 
s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_f32_e32 v0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -153,10 +940,103 @@ ret void } -; GCN-LABEL: {{^}}mad_neg_src1: -; GCN-NOT: v_mac_f32 -; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} define amdgpu_kernel void @mad_neg_src1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: mad_neg_src1: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mad_neg_src1: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 
s5, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mad_neg_src1: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_neg_src1: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mad_neg_src1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; 
GFX11-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -173,10 +1053,103 @@ ret void } -; GCN-LABEL: {{^}}nsz_mad_sub0_src1: -; GCN-NOT: v_mac_f32 -; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} define amdgpu_kernel void @nsz_mad_sub0_src1(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +; SI-LABEL: nsz_mad_sub0_src1: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: nsz_mad_sub0_src1: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-FLUSH-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: nsz_mad_sub0_src1: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: nsz_mad_sub0_src1: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: nsz_mad_sub0_src1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f32_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -193,10 +1166,103 @@ ret void } -; GCN-LABEL: {{^}}mad_neg_src2: -; GCN-NOT: v_mac -; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} define amdgpu_kernel void @mad_neg_src2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: mad_neg_src2: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:8 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mad_f32 v0, v0, v1, -v2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: mad_neg_src2: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; VI-FLUSH-NEXT: s_mov_b32 s6, -1 +; VI-FLUSH-NEXT: s_mov_b32 s10, s6 +; VI-FLUSH-NEXT: s_mov_b32 s11, s7 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: s_mov_b32 s8, s2 +; VI-FLUSH-NEXT: s_mov_b32 s9, s3 +; VI-FLUSH-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-FLUSH-NEXT: s_mov_b32 s4, s0 +; VI-FLUSH-NEXT: s_mov_b32 s5, s1 +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mad_f32 v0, v0, v1, -v2 +; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: mad_neg_src2: +; VI: ; %bb.0: ; 
%entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f32 v0, v0, v1, -v2 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: mad_neg_src2: +; GFX11-FLUSH: ; %bb.0: ; %entry +; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_mov_b32 s6, -1 +; GFX11-FLUSH-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLUSH-NEXT: s_mov_b32 s10, s6 +; GFX11-FLUSH-NEXT: s_mov_b32 s11, s7 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_mov_b32 s8, s2 +; GFX11-FLUSH-NEXT: s_mov_b32 s9, s3 +; GFX11-FLUSH-NEXT: s_mov_b32 s4, s0 +; GFX11-FLUSH-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-FLUSH-NEXT: s_mov_b32 s5, s1 +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: mad_neg_src2: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b96 v[0:2], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: 
v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -215,14 +1281,137 @@ ; Without special casing the inline constant check for v_mac_f32's ; src2, this fails to fold the 1.0 into a mad. - -; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f32: -; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] - -; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] -; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) #3 { +; SI-LABEL: fold_inline_imm_into_mac_src2_f32: +; SI: ; %bb.0: ; %bb +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: v_add_f32_e32 v4, v2, v2 +; SI-NEXT: v_add_f32_e32 v5, v3, v3 +; SI-NEXT: v_mad_f32 v4, v4, -4.0, 1.0 +; SI-NEXT: v_add_f32_e32 v3, v4, v3 +; SI-NEXT: v_mad_f32 v2, -v5, v2, 1.0 +; SI-NEXT: v_mac_f32_e32 v3, 0x41000000, v2 +; SI-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: fold_inline_imm_into_mac_src2_f32: +; VI-FLUSH: ; %bb.0: ; %bb +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; 
VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-FLUSH-NEXT: flat_load_dword v5, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_dword v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_f32_e32 v3, v5, v5 +; VI-FLUSH-NEXT: v_add_f32_e32 v4, v2, v2 +; VI-FLUSH-NEXT: v_mad_f32 v3, v3, -4.0, 1.0 +; VI-FLUSH-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-FLUSH-NEXT: v_mad_f32 v3, -v4, v5, 1.0 +; VI-FLUSH-NEXT: v_mac_f32_e32 v2, 0x41000000, v3 +; VI-FLUSH-NEXT: flat_store_dword v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: fold_inline_imm_into_mac_src2_f32: +; VI: ; %bb.0: ; %bb +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_f32_e32 v3, v5, v5 +; VI-NEXT: v_add_f32_e32 v4, v2, v2 +; VI-NEXT: v_mad_f32 v3, v3, -4.0, 1.0 +; VI-NEXT: v_add_f32_e32 v2, v3, v2 +; VI-NEXT: v_mad_f32 v3, -v4, v5, 1.0 +; VI-NEXT: v_mac_f32_e32 v2, 0x41000000, v3 +; VI-NEXT: flat_store_dword v[0:1], 
v2 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fold_inline_imm_into_mac_src2_f32: +; GFX11-FLUSH: ; %bb.0: ; %bb +; GFX11-FLUSH-NEXT: s_clause 0x1 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_dual_add_f32 v3, v1, v1 :: v_dual_add_f32 v4, v2, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v3, -4.0, v3 +; GFX11-FLUSH-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_dual_mul_f32 v1, v4, v1 :: v_dual_add_f32 v2, v3, v2 +; GFX11-FLUSH-NEXT: v_sub_f32_e32 v1, 1.0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v1, 0x41000000, v1 +; GFX11-FLUSH-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX11-FLUSH-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: fold_inline_imm_into_mac_src2_f32: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v3, v1, v1 :: v_dual_add_f32 v4, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 
v3, -4.0, v3 +; GFX11-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v1, v4, v1 :: v_dual_add_f32 v2, v3, v2 +; GFX11-NEXT: v_sub_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, 0x41000000, v1 +; GFX11-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -244,20 +1433,149 @@ ret void } -; GCN-LABEL: {{^}}fold_inline_imm_into_mac_src2_f16: -; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort [[B:v[0-9]+]] - -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[A]] -; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[B]] - -; SI: v_add_f32_e32 [[TMP2:v[0-9]+]], [[CVT_A]], [[CVT_A]] -; SI: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 -; SI: v_mac_f32_e32 v{{[0-9]+}}, 0x41000000, v{{[0-9]+}} - -; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] -; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f16(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) #3 { +; SI-LABEL: fold_inline_imm_into_mac_src2_f16: +; SI: ; %bb.0: ; %bb +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] +; SI-NEXT: 
v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, v2, v2 +; SI-NEXT: v_add_f32_e32 v5, v3, v3 +; SI-NEXT: v_mad_f32 v4, v4, -4.0, 1.0 +; SI-NEXT: v_add_f32_e32 v3, v4, v3 +; SI-NEXT: v_mad_f32 v2, -v5, v2, 1.0 +; SI-NEXT: v_mac_f32_e32 v3, 0x41000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_endpgm +; +; VI-FLUSH-LABEL: fold_inline_imm_into_mac_src2_f16: +; VI-FLUSH: ; %bb.0: ; %bb +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-FLUSH-NEXT: flat_load_ushort v5, v[0:1] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: v_add_f16_e32 v3, v5, v5 +; VI-FLUSH-NEXT: v_add_f16_e32 v4, v2, v2 +; VI-FLUSH-NEXT: v_mad_f16 v3, v3, -4.0, 1.0 +; VI-FLUSH-NEXT: v_add_f16_e32 v2, v3, v2 +; VI-FLUSH-NEXT: v_mad_f16 v3, -v4, v5, 1.0 +; VI-FLUSH-NEXT: v_mac_f16_e32 v2, 0x4800, v3 +; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-LABEL: fold_inline_imm_into_mac_src2_f16: +; VI: ; %bb.0: ; %bb +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; 
VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_ushort v5, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_ushort v2, v[2:3] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_f16_e32 v3, v5, v5 +; VI-NEXT: v_add_f16_e32 v4, v2, v2 +; VI-NEXT: v_mul_f16_e32 v3, -4.0, v3 +; VI-NEXT: v_mul_f16_e32 v4, v4, v5 +; VI-NEXT: v_add_f16_e32 v3, 1.0, v3 +; VI-NEXT: v_sub_f16_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f16_e32 v2, v3, v2 +; VI-NEXT: v_mul_f16_e32 v3, 0x4800, v4 +; VI-NEXT: v_add_f16_e32 v2, v2, v3 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX11-FLUSH-LABEL: fold_inline_imm_into_mac_src2_f16: +; GFX11-FLUSH: ; %bb.0: ; %bb +; GFX11-FLUSH-NEXT: s_clause 0x1 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v3, v1, v1 +; GFX11-FLUSH-NEXT: v_add_f16_e32 v4, v2, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v3, -4.0, v3 +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v4, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v3, 1.0, v3 +; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, 1.0, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v2, v3, v2 +; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, 0x4800, v1 +; GFX11-FLUSH-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1 +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FLUSH-NEXT: s_endpgm +; +; GFX11-LABEL: fold_inline_imm_into_mac_src2_f16: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v3, v1, v1 +; GFX11-NEXT: v_add_f16_e32 v4, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f16_e32 v3, -4.0, v3 +; GFX11-NEXT: v_mul_f16_e32 v1, v4, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f16_e32 v3, 1.0, v3 +; GFX11-NEXT: v_sub_f16_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f16_e32 v2, v3, v2 +; GFX11-NEXT: v_mul_f16_e32 v1, 0x4800, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v1, v2, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -285,3 +1603,5 @@ attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" } attributes #2 = { nounwind readnone } attributes #3 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll --- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -1,20 +1,102 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s - -; GCN-LABEL: {{^}}mac_f16: -; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]] -; GCN: {{buffer|flat}}_load_ushort v[[C_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]] -; SI: v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]] -; SI: buffer_store_short v[[R_F16]] -; VI: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]] -; VI: buffer_store_short v[[C_F16]] -; GCN: s_endpgm +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX11 %s + define amdgpu_kernel void @mac_f16( +; SI-LABEL: mac_f16: +; SI: ; %bb.0: ; %entry 
+; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_e32 v2, v0, v1 +; VI-NEXT: buffer_store_short v2, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: 
mac_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -31,14 +113,153 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_same_add: -; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] -; SI: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} - -; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] -; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_same_add( +; SI-LABEL: mac_f16_same_add: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x15 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s20, s8 +; SI-NEXT: s_mov_b32 s21, 
s9 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s24, s12 +; SI-NEXT: s_mov_b32 s25, s13 +; SI-NEXT: s_mov_b32 s26, s2 +; SI-NEXT: s_mov_b32 s27, s3 +; SI-NEXT: s_mov_b32 s12, s14 +; SI-NEXT: s_mov_b32 s13, s15 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[20:23], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 +; SI-NEXT: buffer_load_ushort v3, off, s[24:27], 0 +; SI-NEXT: buffer_load_ushort v4, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_mad_f32 v1, v1, v2, v3 +; SI-NEXT: v_mac_f32_e32 v3, v4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v1, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_same_add: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x54 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s20, s8 +; VI-NEXT: s_mov_b32 s21, s9 +; VI-NEXT: s_mov_b32 s23, s3 +; VI-NEXT: s_mov_b32 s8, s10 +; VI-NEXT: s_mov_b32 s9, s11 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s24, s12 +; VI-NEXT: 
s_mov_b32 s25, s13 +; VI-NEXT: s_mov_b32 s26, s2 +; VI-NEXT: s_mov_b32 s27, s3 +; VI-NEXT: s_mov_b32 s12, s14 +; VI-NEXT: s_mov_b32 s13, s15 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s18, s2 +; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[20:23], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[24:27], 0 +; VI-NEXT: buffer_load_ushort v3, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v4, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_mad_f16 v0, v0, v1, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_e32 v2, v4, v3 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v2, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_same_add: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[16:17], s[0:1], 0x54 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s2 +; GFX11-NEXT: s_mov_b32 s19, s3 +; GFX11-NEXT: s_mov_b32 s22, s2 +; GFX11-NEXT: s_mov_b32 s23, s3 +; GFX11-NEXT: s_mov_b32 s26, s2 +; GFX11-NEXT: s_mov_b32 s27, s3 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s30, s2 +; GFX11-NEXT: s_mov_b32 s31, s3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s20, s8 +; GFX11-NEXT: s_mov_b32 s21, s9 +; GFX11-NEXT: s_mov_b32 s24, s10 +; GFX11-NEXT: s_mov_b32 s25, s11 +; GFX11-NEXT: s_mov_b32 s36, s14 +; GFX11-NEXT: s_mov_b32 s37, s15 +; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[24:27], 0 +; GFX11-NEXT: buffer_load_u16 v3, off, s[36:39], 0 +; GFX11-NEXT: 
s_mov_b32 s28, s12 +; GFX11-NEXT: s_mov_b32 s29, s13 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: buffer_load_u16 v4, off, s[28:31], 0 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_mov_b32 s10, s2 +; GFX11-NEXT: s_mov_b32 s11, s3 +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_mul_f16_e32 v1, v1, v2 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v3, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f16_e32 v1, v1, v4 +; GFX11-NEXT: v_add_f16_e32 v0, v0, v4 +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r0, ptr addrspace(1) %r1, ptr addrspace(1) %a, @@ -64,16 +285,100 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_a: -; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_a( +; SI-LABEL: mac_f16_neg_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, 
s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f16 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: 
s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -91,16 +396,100 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_b: -; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_b( +; SI-LABEL: mac_f16_neg_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f16 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: 
v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -118,16 +507,100 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_c: -; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 -; SI: v_cvt_f32_f16_e32 -; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_c( +; SI-LABEL: mac_f16_neg_c: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, v0, v1, -v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_c: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; 
VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f16 v0, v0, v1, -v2 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_c: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -145,13 +618,105 @@ ret void 
} -; GCN-LABEL: {{^}}mac_f16_neg_a_safe_fp_math: -; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} -; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math( +; SI-LABEL: mac_f16_neg_a_safe_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_sub_f32_e32 v0, 0, v0 +; SI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_a_safe_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 
s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_sub_f16_e32 v0, 0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_e32 v2, v0, v1 +; VI-NEXT: buffer_store_short v2, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_a_safe_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_sub_f16_e32 v0, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -169,13 +734,105 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_b_safe_fp_math: -; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} 
-; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] -; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math( +; SI-LABEL: mac_f16_neg_b_safe_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_sub_f32_e32 v1, 0, v1 +; SI-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_b_safe_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; 
VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_sub_f16_e32 v1, 0, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_e32 v2, v0, v1 +; VI-NEXT: buffer_store_short v2, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_b_safe_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s4 +; GFX11-NEXT: s_mov_b32 s13, s5 +; GFX11-NEXT: s_mov_b32 s16, s2 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s17, s3 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_sub_f16_e32 v0, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f16_e32 v0, v1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -193,13 +850,104 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_c_safe_fp_math: -; SI: v_sub_f32_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_mac_f32_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 
0, v{{[0-9]+}} -; VI: v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_c_safe_fp_math( +; SI-LABEL: mac_f16_neg_c_safe_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_sub_f32_e32 v1, 0, v1 +; SI-NEXT: v_mac_f32_e32 v1, v0, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_c_safe_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 +; 
VI-NEXT: buffer_load_ushort v2, off, s[16:19], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_sub_f16_e32 v1, 0, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_e32 v1, v0, v2 +; VI-NEXT: buffer_store_short v1, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_c_safe_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_sub_f16_e32 v1, 0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -217,16 +965,100 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_a_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} -; 
GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_a_nsz_fp_math( +; SI-LABEL: mac_f16_neg_a_nsz_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_a_nsz_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; 
VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f16 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_a_nsz_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -244,16 +1076,100 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_b_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]] - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_b_nsz_fp_math( +; SI-LABEL: mac_f16_neg_b_nsz_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; 
SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_b_nsz_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f16 v0, -v0, v1, v2 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_b_nsz_fp_math: +; GFX11: ; %bb.0: ; %entry +; 
GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -271,16 +1187,100 @@ ret void } -; GCN-LABEL: {{^}}mac_f16_neg_c_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, [[CVT_A]], [[CVT_B]], -[[CVT_C]] - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}} -; GCN: s_endpgm define amdgpu_kernel void @mac_f16_neg_c_nsz_fp_math( +; SI-LABEL: mac_f16_neg_c_nsz_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; 
SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, v0, v1, -v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_f16_neg_c_nsz_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mad_f16 v0, v0, v1, -v2 +; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_f16_neg_c_nsz_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; 
GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -298,41 +1298,123 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16: -; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]] - -; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] - -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] - -; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]] - -; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]] -; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 
v[[R_F16_1:[0-9]+]], v[[C_F32_1]] -; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; VI-NOT: and -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]] - -; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]] -; VI-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]] -; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]] -; VI-NOT: and -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]] - -; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]] -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16( +; SI-LABEL: mac_v2f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_barrier +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_barrier +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mac_f32_e32 v5, 
v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_mac_f32_e32 v2, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_barrier +; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_barrier +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_mac_f16_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_mac_f16_e32 v2, v0, v1 +; VI-NEXT: v_or_b32_e32 v0, v2, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; 
GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_barrier +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_barrier +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -351,19 +1433,179 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_same_add: -; SI-DAG: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; VI-DAG: v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_same_add( +; SI-LABEL: mac_v2f16_same_add: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x15 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s22, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s20, s8 +; SI-NEXT: 
s_mov_b32 s21, s9 +; SI-NEXT: s_mov_b32 s23, s3 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s24, s12 +; SI-NEXT: s_mov_b32 s25, s13 +; SI-NEXT: s_mov_b32 s26, s2 +; SI-NEXT: s_mov_b32 s27, s3 +; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s12, s14 +; SI-NEXT: buffer_load_dword v1, off, s[20:23], 0 +; SI-NEXT: s_mov_b32 s13, s15 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: buffer_load_dword v3, off, s[24:27], 0 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v4, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_mad_f32 v6, v6, v7, v8 +; SI-NEXT: v_mad_f32 v1, v1, v2, v3 +; SI-NEXT: v_mac_f32_e32 v8, v9, v5 +; SI-NEXT: v_mac_f32_e32 v3, v4, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v2, v0 +; SI-NEXT: 
v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_same_add: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x54 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s22, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s20, s8 +; VI-NEXT: s_mov_b32 s21, s9 +; VI-NEXT: s_mov_b32 s23, s3 +; VI-NEXT: s_mov_b32 s8, s10 +; VI-NEXT: s_mov_b32 s9, s11 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s24, s12 +; VI-NEXT: s_mov_b32 s25, s13 +; VI-NEXT: s_mov_b32 s26, s2 +; VI-NEXT: s_mov_b32 s27, s3 +; VI-NEXT: s_mov_b32 s12, s14 +; VI-NEXT: s_mov_b32 s13, s15 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s18, s2 +; VI-NEXT: s_mov_b32 s19, s3 +; VI-NEXT: buffer_load_dword v0, off, s[20:23], 0 +; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v2, off, s[24:27], 0 +; VI-NEXT: buffer_load_dword v3, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v4, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mac_f16_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mad_f16 v6, v0, v1, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; VI-NEXT: v_mac_f16_e32 v2, v4, v3 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; 
VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_same_add: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[16:17], s[0:1], 0x54 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s2 +; GFX11-NEXT: s_mov_b32 s19, s3 +; GFX11-NEXT: s_mov_b32 s22, s2 +; GFX11-NEXT: s_mov_b32 s23, s3 +; GFX11-NEXT: s_mov_b32 s26, s2 +; GFX11-NEXT: s_mov_b32 s27, s3 +; GFX11-NEXT: s_mov_b32 s38, s2 +; GFX11-NEXT: s_mov_b32 s39, s3 +; GFX11-NEXT: s_mov_b32 s30, s2 +; GFX11-NEXT: s_mov_b32 s31, s3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s20, s8 +; GFX11-NEXT: s_mov_b32 s21, s9 +; GFX11-NEXT: s_mov_b32 s24, s10 +; GFX11-NEXT: s_mov_b32 s25, s11 +; GFX11-NEXT: s_mov_b32 s36, s14 +; GFX11-NEXT: s_mov_b32 s37, s15 +; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[24:27], 0 +; GFX11-NEXT: buffer_load_b32 v3, off, s[36:39], 0 +; GFX11-NEXT: s_mov_b32 s28, s12 +; GFX11-NEXT: s_mov_b32 s29, s13 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: buffer_load_b32 v4, off, s[28:31], 0 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_mov_b32 s10, s2 +; GFX11-NEXT: s_mov_b32 s11, s3 +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_pk_mul_f16 v1, v1, v2 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_mul_f16 v0, v3, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_add_f16 v1, v1, v4 +; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm 
ptr addrspace(1) %r0, ptr addrspace(1) %r1, ptr addrspace(1) %a, @@ -389,18 +1631,118 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_a: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} - -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_a( +; SI-LABEL: mac_v2f16_neg_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mad_f32 v1, -v3, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 
16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_mad_f16 v3, -v4, v3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_mad_f16 v0, -v1, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; 
GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -418,18 +1760,118 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_b -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_b( +; SI-LABEL: mac_v2f16_neg_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 
v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mad_f32 v1, -v3, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_mad_f16 v3, -v4, v3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_mad_f16 v0, -v1, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -447,22 +1889,118 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_c: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} - -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_c( +; SI-LABEL: mac_v2f16_neg_c: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 
s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, v0, v4, -v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mad_f32 v1, v3, v1, -v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_c: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 
+; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_mad_f16 v3, v4, v3, -v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_mad_f16 v0, v1, v0, -v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_c: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -480,21 +2018,123 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math: - -; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} -; SI: 
v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} - -; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} - -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math( +; SI-LABEL: mac_v2f16_neg_a_safe_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_sub_f32_e32 v3, 0, v3 +; SI-NEXT: v_sub_f32_e32 v0, 0, v0 +; SI-NEXT: 
v_mac_f32_e32 v5, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_mac_f32_e32 v2, v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_a_safe_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v2, off, s[16:19], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_sub_f16_e32 v4, 0, v0 +; VI-NEXT: v_sub_f16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; VI-NEXT: v_mac_f16_e32 v1, v4, v2 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_a_safe_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 
s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_pk_add_f16 v0, v0, 0 neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -512,21 +2152,123 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math: - -; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] -; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] - -; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] - -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math( +; SI-LABEL: mac_v2f16_neg_b_safe_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: 
s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_sub_f32_e32 v4, 0, v4 +; SI-NEXT: v_sub_f32_e32 v1, 0, v1 +; SI-NEXT: v_mac_f32_e32 v5, v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_mac_f32_e32 v2, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_b_safe_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_mov_b32 s14, 
s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_sub_f16_e32 v4, 0, v0 +; VI-NEXT: v_sub_f16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_sdwa v3, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; VI-NEXT: v_mac_f16_e32 v1, v2, v4 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_b_safe_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s4 +; GFX11-NEXT: s_mov_b32 s13, s5 +; GFX11-NEXT: s_mov_b32 s16, s2 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: s_mov_b32 s17, s3 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_pk_add_f16 v0, v0, 0 neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_mul_f16 v0, v1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -544,21 +2286,119 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math: - -; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} -; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} - -; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_mac_f16_sdwa v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} - -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math( +; SI-LABEL: mac_v2f16_neg_c_safe_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_sub_f32_e32 v5, 0, v5 +; SI-NEXT: v_sub_f32_e32 v2, 0, v2 +; SI-NEXT: v_mac_f32_e32 v5, v0, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 +; SI-NEXT: v_mac_f32_e32 v2, v3, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_c_safe_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[16:19], 0 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_sub_f16_e32 v4, 0, v0 +; VI-NEXT: v_sub_f16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f16_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_mac_f16_e32 v4, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_c_safe_fp_math: +; GFX11: ; 
%bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s22, s10 +; GFX11-NEXT: s_mov_b32 s23, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s20, s6 +; GFX11-NEXT: s_mov_b32 s21, s7 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_add_f16 v1, v1, 0 neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -576,22 +2416,118 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_a_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} - -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -; GCN: s_endpgm define 
amdgpu_kernel void @mac_v2f16_neg_a_nsz_fp_math( +; SI-LABEL: mac_v2f16_neg_a_nsz_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mad_f32 v1, -v3, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_a_nsz_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; 
VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_mad_f16 v3, -v4, v3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_mad_f16 v0, -v1, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_a_nsz_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr 
addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -609,22 +2545,118 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_b_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} - -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_b_nsz_fp_math( +; SI-LABEL: mac_v2f16_neg_b_nsz_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, -v0, v4, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mad_f32 v1, -v3, v1, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_b_nsz_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_mad_f16 v3, -v4, v3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_mad_f16 v0, -v1, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_b_nsz_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, 
s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v2, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -642,22 +2674,118 @@ ret void } -; GCN-LABEL: {{^}}mac_v2f16_neg_c_nsz_fp_math: -; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}} -; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}} - -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} - -; VI-NOT: v_mac_f16 -; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} -; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} -; GCN: s_endpgm define amdgpu_kernel void @mac_v2f16_neg_c_nsz_fp_math( +; SI-LABEL: mac_v2f16_neg_c_nsz_fp_math: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: 
s_mov_b32 s16, s4 +; SI-NEXT: s_mov_b32 s17, s5 +; SI-NEXT: s_mov_b32 s18, s10 +; SI-NEXT: s_mov_b32 s19, s11 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_mad_f32 v0, v0, v4, -v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_mad_f32 v1, v3, v1, -v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: mac_v2f16_neg_c_nsz_fp_math: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s18, s10 +; VI-NEXT: s_mov_b32 s19, s11 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s16, s4 +; VI-NEXT: s_mov_b32 s17, s5 +; VI-NEXT: s_mov_b32 s14, s10 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s4, s6 +; VI-NEXT: s_mov_b32 s5, s7 +; VI-NEXT: s_mov_b32 s6, s10 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s8, s0 +; VI-NEXT: s_mov_b32 s9, s1 +; VI-NEXT: s_waitcnt vmcnt(2) +; 
VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_mad_f16 v3, v4, v3, -v5 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_mad_f16 v0, v1, v0, -v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: s_endpgm +; +; GFX11-LABEL: mac_v2f16_neg_c_nsz_fp_math: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s18, s10 +; GFX11-NEXT: s_mov_b32 s19, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s2 +; GFX11-NEXT: s_mov_b32 s13, s3 +; GFX11-NEXT: s_mov_b32 s16, s4 +; GFX11-NEXT: s_mov_b32 s17, s5 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s10 +; GFX11-NEXT: s_mov_b32 s7, s11 +; GFX11-NEXT: s_mov_b32 s8, s0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-NEXT: s_mov_b32 s9, s1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -680,3 +2808,5 @@ attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" } attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" "denormal-fp-math"="preserve-sign,preserve-sign" } attributes #2 = { nounwind convergent } +;; NOTE: These prefixes are unused and the list 
is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI +; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11 define amdgpu_kernel void @madak_f16( ; SI-LABEL: madak_f16: @@ -50,6 +51,32 @@ ; VI-NEXT: v_madak_f16 v0, v0, v1, 0x4900 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: madak_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 { @@ -137,6 +164,44 @@ ; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 ; VI-NEXT: 
buffer_store_short v3, off, s[8:11], 0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: madak_f16_use_2: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX11-NEXT: s_mov_b32 s14, -1 +; GFX11-NEXT: s_mov_b32 s15, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s14 +; GFX11-NEXT: s_mov_b32 s19, s15 +; GFX11-NEXT: s_mov_b32 s22, s14 +; GFX11-NEXT: s_mov_b32 s23, s15 +; GFX11-NEXT: s_mov_b32 s2, s14 +; GFX11-NEXT: s_mov_b32 s3, s15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s16, s8 +; GFX11-NEXT: s_mov_b32 s17, s9 +; GFX11-NEXT: s_mov_b32 s20, s10 +; GFX11-NEXT: s_mov_b32 s21, s11 +; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v2, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s4 +; GFX11-NEXT: s_mov_b32 s13, s5 +; GFX11-NEXT: s_mov_b32 s0, s6 +; GFX11-NEXT: s_mov_b32 s1, s7 +; GFX11-NEXT: v_mul_f16_e32 v1, v0, v1 +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f16_e32 v1, 0x4900, v1 +; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[12:15], 0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r0, ptr addrspace(1) %r1, ptr addrspace(1) %a, diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tahiti 
-verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=SI %s ; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s +; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_constant_load: @@ -32,6 +33,21 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i16_constant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_addk_i32 s0, 0x3e7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, 4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 %add = add i16 %load, 999 %or = or i16 %add, 4 @@ -71,6 +87,23 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i16_constant_load_zext_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_addk_i32 s0, 0x3e7 +; GFX11-NEXT: s_or_b32 s0, s0, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 %ext = zext i16 %load to i32 %add = add i32 %ext, 999 @@ -111,6 +144,23 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i16_constant_load_sext_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sext_i32_i16 s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_addk_i32 s0, 0x3e7 +; GFX11-NEXT: s_or_b32 s0, s0, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 %ext = sext i16 %load to i32 %add = add i32 %ext, 999 @@ -162,6 +212,27 @@ ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_byte v[2:3], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i17_constant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s0, s0, 34 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, 4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: s_and_b32 s0, s0, 0x1ffff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 
v[0:1], v4, off +; GFX11-NEXT: global_store_d16_hi_b8 v[2:3], v5, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i17, ptr addrspace(4) %arg, align 4 %add = add i17 %load, 34 %or = or i17 %add, 4 @@ -197,6 +268,19 @@ ; VI-NEXT: v_add_f16_e64 v2, s0, 4.0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_f16_constant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f16_e64 v2, s0, 4.0 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load half, ptr addrspace(4) %arg, align 4 %add = fadd half %load, 4.0 store half %add, ptr addrspace(1) null @@ -245,6 +329,28 @@ ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_v2i8_constant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v0, s0, 12 +; GFX11-NEXT: v_and_b32_e64 v1, 0xffffff00, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x2c00 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v2, 0x300, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load <2 x i8>, ptr addrspace(4) %arg, align 4 %add = add <2 x i8> %load, <i8 12, i8 44> %or = or <2 x i8> %add, <i8 4, i8 3> @@ -288,6 +394,22 @@ ; VI-NEXT: v_or_b32_e32 v2, 4, v2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: no_widen_i16_constant_divergent_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x3e7 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v2, 4, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = zext i32 %tid to i64 %gep.arg = getelementptr inbounds i16, ptr addrspace(4) %arg, i64 %tid.ext @@ -326,6 +448,20 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i1_constant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, s0, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i1, ptr addrspace(4) %arg, align 4 %and = and i1 %load, true store i1 %and, ptr addrspace(1) null @@ -364,6 +500,23 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i16_zextload_i64_constant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_addk_i32 s0, 0x3e7 +; GFX11-NEXT: s_or_b32 s0, s0, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 %zext = zext i16 %load to i32 %add = add i32 %zext, 999 @@ -406,6 +559,22 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i1_zext_to_i64_constant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, s0, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, 0x3e7 +; GFX11-NEXT: s_addc_u32 s1, 0, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i1, ptr addrspace(4) %arg, align 4 %zext = zext i1 %load to i64 %add = add i64 %zext, 999 @@ -444,6 +613,22 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i16_constant32_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-NEXT: s_addk_i32 s0, 0x3e7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, 4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(6) %arg, align 4 %add = add i16 %load, 999 %or = or i16 %add, 4 @@ -481,6 +666,21 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i16_global_invariant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_addk_i32 s0, 0x3e7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, 1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(1) %arg, align 4, !invariant.load !0 %add = add i16 %load, 999 %or = or i16 %add, 1