diff --git a/llvm/test/CodeGen/AMDGPU/call-return-types.ll b/llvm/test/CodeGen/AMDGPU/call-return-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-return-types.ll @@ -1,6 +1,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX89 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX89 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s declare void @external_void_func_void() #0 @@ -178,6 +179,7 @@ ; GCN: s_swappc ; GFX7-DAG: flat_store_dwordx3 {{.*}}, v[0:2] ; GFX89-DAG: buffer_store_dwordx3 v[0:2] +; GFX11-DAG: buffer_store_b96 v[0:2] define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 { %val = call <3 x i32> @external_v3i32_func_void() store volatile <3 x i32> %val, ptr addrspace(1) undef, align 8 @@ -197,6 +199,8 @@ ; GFX7-DAG: flat_store_dword {{.*}}, v4 ; GFX89-DAG: buffer_store_dwordx4 v[0:3] ; GFX89-DAG: buffer_store_dword v4 +; GFX11-DAG: buffer_store_b128 v[0:3] +; GFX11-DAG: buffer_store_b32 v4 define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 { %val = call <5 x i32> @external_v5i32_func_void() store volatile <5 x i32> %val, ptr addrspace(1) undef, align 8 @@ -254,7 +258,7 @@ ; GCN-LABEL: {{^}}test_call_external_v2i24_func_void: ; GCN: s_swappc_b64 -; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}v0, v1 +; GCN: v_add_{{(nc_)?}}{{i|u}}32_e32 v0, {{(vcc, )?}}v0, v1 define amdgpu_kernel void @test_call_external_v2i24_func_void() #0 { %val = call <2 x i24> @external_v2i24_func_void() %elt0 = extractelement <2 x i24> %val, i32 0 @@ -268,6 +272,7 @@ ; GCN: s_swappc ; GFX7-DAG: flat_store_dwordx3 {{.*}}, 
v[0:2] ; GFX89-DAG: buffer_store_dwordx3 v[0:2] +; GFX11-DAG: buffer_store_b96 v[0:2] define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 { %val = call <3 x float> @external_v3f32_func_void() store volatile <3 x float> %val, ptr addrspace(1) undef @@ -280,6 +285,8 @@ ; GFX7-DAG: flat_store_dword {{.*}}, v4 ; GFX89-DAG: buffer_store_dwordx4 v[0:3] ; GFX89-DAG: buffer_store_dword v4 +; GFX11-DAG: buffer_store_b128 v[0:3] +; GFX11-DAG: buffer_store_b32 v4 define amdgpu_kernel void @test_call_external_v5f32_func_void() #0 { %val = call <5 x float> @external_v5f32_func_void() store volatile <5 x float> %val, ptr addrspace(1) undef diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -1,11 +1,12 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: not llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,GCN,GFX11 %s +; RUN: not llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=FUNC,R600 %s ; FUNC-LABEL: {{^}}s_fneg_f32: ; R600: -PV -; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_load_{{dword|b32}} [[VAL:s[0-9]+]] ; GCN: s_xor_b32 [[NEG_VAL:s[0-9]+]], [[VAL]], 0x80000000 ; 
GCN: v_mov_b32_e32 v{{[0-9]+}}, [[NEG_VAL]] define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) { @@ -61,10 +62,11 @@ ; FUNC-LABEL: {{^}}fneg_free_f32: ; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GFX11: s_load_b32 [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN: s_xor_b32 [[RES:s[0-9]+]], [[NEG_VALUE]], 0x80000000 ; GCN: v_mov_b32_e32 [[V_RES:v[0-9]+]], [[RES]] -; GCN: buffer_store_dword [[V_RES]] +; GCN: buffer_store_{{dword|b32}} [[V_RES]] ; R600-NOT: XOR ; R600: -PV.W @@ -78,6 +80,7 @@ ; FUNC-LABEL: {{^}}fneg_fold_f32: ; SI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c +; GFX11: s_load_{{dword|b32}} [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: xor ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]] define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) { @@ -100,7 +103,7 @@ } ; FUNC-LABEL: {{^}}s_fneg_i32: -; GCN: s_load_dword [[IN:s[0-9]+]] +; GCN: s_load_{{dword|b32}} [[IN:s[0-9]+]] ; GCN: s_xor_b32 [[FNEG:s[0-9]+]], [[IN]], 0x80000000 ; GCN: v_mov_b32_e32 [[V_FNEG:v[0-9]+]], [[FNEG]] define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) { @@ -111,6 +114,7 @@ ; FUNC-LABEL: {{^}}v_fneg_i32: ; GCN: s_waitcnt +; GFX11: s_waitcnt_vscnt ; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: s_setpc_b64 define i32 @v_fneg_i32(i32 %in) { @@ -119,7 +123,7 @@ } ; FUNC-LABEL: {{^}}s_fneg_i32_fp_use: -; GCN: s_load_dword [[IN:s[0-9]+]] +; GCN: s_load_{{dword|b32}} [[IN:s[0-9]+]] ; GCN: v_sub_f32_e64 v{{[0-9]+}}, 2.0, [[IN]] define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) { %fneg = xor i32 %in, -2147483648 @@ -131,6 +135,7 @@ ; FUNC-LABEL: {{^}}v_fneg_i32_fp_use: ; GCN: s_waitcnt +; GFX11: s_waitcnt_vscnt ; GCN-NEXT: v_sub_f32_e32 v0, 2.0, v0 ; GCN-NEXT: s_setpc_b64 define 
float @v_fneg_i32_fp_use(i32 %in) { @@ -150,6 +155,7 @@ ; FUNC-LABEL: {{^}}v_fneg_i64: ; GCN: s_waitcnt +; GFX11: s_waitcnt_vscnt ; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GCN-NEXT: s_setpc_b64 define i64 @v_fneg_i64(i64 %in) { @@ -169,6 +175,7 @@ ; FUNC-LABEL: {{^}}v_fneg_i64_fp_use: ; GCN: s_waitcnt +; GFX11: s_waitcnt_vscnt ; GCN-NEXT: v_add_f64 v[0:1], -v[0:1], 2.0 ; GCN-NEXT: s_setpc_b64 define double @v_fneg_i64_fp_use(i64 %in) { @@ -180,6 +187,7 @@ ; FUNC-LABEL: {{^}}v_fneg_i16: ; GCN: s_waitcnt +; GFX11: s_waitcnt_vscnt ; GCN-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 ; GCN-NEXT: s_setpc_b64 define i16 @v_fneg_i16(i16 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/unpack-half.ll b/llvm/test/CodeGen/AMDGPU/unpack-half.ll --- a/llvm/test/CodeGen/AMDGPU/unpack-half.ll +++ b/llvm/test/CodeGen/AMDGPU/unpack-half.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s ; On gfx6 and gfx7, this test shows a bug in SelectionDAG where scalarizing the ; extension of a vector of f16 generates an illegal node that errors later. 
@@ -23,4 +24,3 @@ } attributes #0 = { nounwind } - diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI +; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11 define amdgpu_kernel void @madak_f16( ; SI-LABEL: madak_f16: @@ -50,6 +51,32 @@ ; VI-NEXT: v_madak_f16 v0, v0, v1, 0x4900 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: madak_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 { @@ -137,6 +164,44 @@ ; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 ; VI-NEXT: buffer_store_short 
v3, off, s[8:11], 0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: madak_f16_use_2: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44 +; GFX11-NEXT: s_mov_b32 s14, -1 +; GFX11-NEXT: s_mov_b32 s15, 0x31016000 +; GFX11-NEXT: s_mov_b32 s18, s14 +; GFX11-NEXT: s_mov_b32 s19, s15 +; GFX11-NEXT: s_mov_b32 s22, s14 +; GFX11-NEXT: s_mov_b32 s23, s15 +; GFX11-NEXT: s_mov_b32 s2, s14 +; GFX11-NEXT: s_mov_b32 s3, s15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s16, s8 +; GFX11-NEXT: s_mov_b32 s17, s9 +; GFX11-NEXT: s_mov_b32 s20, s10 +; GFX11-NEXT: s_mov_b32 s21, s11 +; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v2, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s4 +; GFX11-NEXT: s_mov_b32 s13, s5 +; GFX11-NEXT: s_mov_b32 s0, s6 +; GFX11-NEXT: s_mov_b32 s1, s7 +; GFX11-NEXT: v_mul_f16_e32 v1, v0, v1 +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f16_e32 v1, 0x4900, v1 +; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[12:15], 0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r0, ptr addrspace(1) %r1, ptr addrspace(1) %a, diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | 
FileCheck -enable-var-scope --check-prefix=SI %s ; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s +; RUN: llc -amdgpu-codegenprepare-widen-constant-loads=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) { ; SI-LABEL: widen_i16_constant_load: @@ -32,6 +33,21 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i16_constant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_addk_i32 s0, 0x3e7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, 4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 %add = add i16 %load, 999 %or = or i16 %add, 4 @@ -71,6 +87,23 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i16_constant_load_zext_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_addk_i32 s0, 0x3e7 +; GFX11-NEXT: s_or_b32 s0, s0, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; 
GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 %ext = zext i16 %load to i32 %add = add i32 %ext, 999 @@ -111,6 +144,23 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i16_constant_load_sext_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sext_i32_i16 s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_addk_i32 s0, 0x3e7 +; GFX11-NEXT: s_or_b32 s0, s0, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 %ext = sext i16 %load to i32 %add = add i32 %ext, 999 @@ -162,6 +212,27 @@ ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_store_byte v[2:3], v0 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i17_constant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s0, s0, 34 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, 4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: s_and_b32 s0, s0, 0x1ffff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v[0:1], v4, off +; 
GFX11-NEXT: global_store_d16_hi_b8 v[2:3], v5, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i17, ptr addrspace(4) %arg, align 4 %add = add i17 %load, 34 %or = or i17 %add, 4 @@ -197,6 +268,19 @@ ; VI-NEXT: v_add_f16_e64 v2, s0, 4.0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_f16_constant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_f16_e64 v2, s0, 4.0 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load half, ptr addrspace(4) %arg, align 4 %add = fadd half %load, 4.0 store half %add, ptr addrspace(1) null @@ -246,6 +330,28 @@ ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_v2i8_constant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v0, s0, 12 +; GFX11-NEXT: v_and_b32_e64 v1, 0xffffff00, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x2c00 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v2, 0x300, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load <2 x i8>, ptr addrspace(4) %arg, align 4 %add = add <2 x i8> %load, <i8 12, i8 44> %or = or <2 x i8> %add, <i8 4, i8 3> @@ -289,6 +395,22 @@ ; VI-NEXT: v_or_b32_e32 v2, 4, v2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: no_widen_i16_constant_divergent_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x3e7 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v2, 4, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = zext i32 %tid to i64 %gep.arg = getelementptr inbounds i16, ptr addrspace(4) %arg, i64 %tid.ext @@ -327,6 +449,20 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i1_constant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, s0, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i1, ptr addrspace(4) %arg, align 4 %and = and i1 %load, true store i1 %and, ptr addrspace(1) null @@ -365,6 +501,23 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i16_zextload_i64_constant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_addk_i32 s0, 0x3e7 +; GFX11-NEXT: s_or_b32 s0, s0, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 %zext = zext i16 %load to i32 %add = add i32 %zext, 999 @@ -407,6 +560,22 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i1_zext_to_i64_constant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, s0, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s0, 0x3e7 +; GFX11-NEXT: s_addc_u32 s1, 0, 0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i1, ptr addrspace(4) %arg, align 4 %zext = zext i1 %load to i64 %add = add i64 %zext, 999 @@ -445,6 +614,22 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i16_constant32_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-NEXT: s_addk_i32 s0, 0x3e7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, 4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(6) %arg, align 4 %add = add i16 %load, 999 %or = or i16 %add, 4 @@ -482,6 +667,21 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: widen_i16_global_invariant_load: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_addk_i32 s0, 0x3e7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, 1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(1) %arg, align 4, !invariant.load !0 %add = add i16 %load, 999 %or = or i16 %add, 1