diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX78,GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX78,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s define i16 @v_powi_f16(i16 %l, i32 %r) { ; GFX7-LABEL: v_powi_f16: @@ -26,6 +27,23 @@ ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX8-NEXT: v_exp_f16_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_powi_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %l.cast = bitcast i16 %l to half %res = call half @llvm.powi.f16.i32(half %l.cast, i32 %r) %res.cast = bitcast half %res to i16 @@ -33,129 +51,242 @@ } define float @v_powi_f32(float %l, i32 %r) { -; GCN-LABEL: v_powi_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX78-LABEL: v_powi_f32: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 +; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_powi_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 %r) ret float %res } define float @v_powi_0_f32(float %l) { -; GCN-LABEL: v_powi_0_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, 1.0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX78-LABEL: v_powi_0_f32: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX78-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_powi_0_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 0) ret float %res } define float @v_powi_1_f32(float %l) { -; GCN-LABEL: v_powi_1_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX78-LABEL: v_powi_1_f32: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_powi_1_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 1) ret float %res } define float @v_powi_neg1_f32(float %l) { -; GCN-LABEL: v_powi_neg1_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, -1.0, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX78-LABEL: v_powi_neg1_f32: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -1.0, v0 +; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_powi_neg1_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, -1.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -1) ret float %res } define float @v_powi_2_f32(float %l) { -; GCN-LABEL: v_powi_2_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, 2.0, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX78-LABEL: v_powi_2_f32: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 2.0, v0 +; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_powi_2_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 2.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 2) ret float %res } define float @v_powi_neg2_f32(float %l) { -; GCN-LABEL: v_powi_neg2_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, -2.0, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX78-LABEL: v_powi_neg2_f32: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mul_legacy_f32_e32 v0, -2.0, v0 +; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_powi_neg2_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, -2.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -2) ret float %res } define float @v_powi_4_f32(float %l) { -; GCN-LABEL: v_powi_4_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, 4.0, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX78-LABEL: v_powi_4_f32: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 4.0, v0 +; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_powi_4_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 4.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 4) ret float %res } define float @v_powi_8_f32(float %l) { -; GCN-LABEL: v_powi_8_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, 0x41000000, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX78-LABEL: v_powi_8_f32: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x41000000, v0 +; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_powi_8_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x41000000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 8) ret float %res } define float @v_powi_16_f32(float %l) { -; GCN-LABEL: v_powi_16_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, 0x41800000, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX78-LABEL: v_powi_16_f32: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x41800000, v0 +; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_powi_16_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x41800000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 16) ret float %res } define float @v_powi_128_f32(float %l) { -; GCN-LABEL: v_powi_128_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, 0x43000000, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX78-LABEL: v_powi_128_f32: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0x43000000, v0 +; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_powi_128_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0x43000000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 128) ret float %res } define float @v_powi_neg128_f32(float %l) { -; GCN-LABEL: v_powi_neg128_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_log_f32_e32 v0, v0 -; GCN-NEXT: v_mul_legacy_f32_e32 v0, 0xc3000000, v0 -; GCN-NEXT: v_exp_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX78-LABEL: v_powi_neg128_f32: +; GFX78: ; %bb.0: +; GFX78-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX78-NEXT: v_log_f32_e32 v0, v0 +; GFX78-NEXT: v_mul_legacy_f32_e32 v0, 0xc3000000, v0 +; GFX78-NEXT: v_exp_f32_e32 v0, v0 +; GFX78-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_powi_neg128_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 0xc3000000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %res = call float @llvm.powi.f32.i32(float %l, i32 -128) ret float %res } @@ -171,3 +302,5 @@ declare double @llvm.powi.f64.i32(double, i32) #0 attributes #0 = { nounwind readnone speculatable willreturn } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -1,13 +1,82 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s -; GCN-LABEL: {{^}}fptrunc_f32_to_f16: -; GCN: buffer_load_dword v[[A_F32:[0-9]+]] -; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fptrunc_f32_to_f16( +; SI-LABEL: fptrunc_f32_to_f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fptrunc_f32_to_f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fptrunc_f32_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fptrunc_f32_to_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -17,13 +86,84 @@ ret void } -; GCN-LABEL: {{^}}fptrunc_f64_to_f16: -; GCN: buffer_load_dwordx2 v[[[A_F64_0:[0-9]+]]:[[A_F64_1:[0-9]+]]] -; GCN: v_cvt_f32_f64_e32 v[[A_F32:[0-9]+]], v[[[A_F64_0]]:[[A_F64_1]]] -; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fptrunc_f64_to_f16( +; SI-LABEL: fptrunc_f64_to_f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fptrunc_f64_to_f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fptrunc_f64_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fptrunc_f64_to_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -33,23 +173,89 @@ ret void } -; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16: -; GCN: buffer_load_dwordx2 v[[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]] -; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] - -; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] -; GFX9: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm - define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( +; SI-LABEL: fptrunc_v2f32_to_v2f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fptrunc_v2f32_to_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fptrunc_v2f32_to_v2f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fptrunc_v2f32_to_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -59,24 +265,99 @@ ret void } -; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16: -; GCN: buffer_load_dwordx4 v[[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]] -; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v[[[A_F64_0]]:{{[0-9]+}}] -; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v[{{[0-9]+}}:[[A_F64_3]]] -; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] -; -; SI-DAG: v_cvt_f16_f32_e32 v[[CVTHI:[0-9]+]], v[[A_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[CVTHI]] - -; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] -; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_0]] - -; GCN: buffer_store_dword v[[R_V2_F16]] - define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( +; SI-LABEL: fptrunc_v2f64_to_v2f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] +; SI-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fptrunc_v2f64_to_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] +; VI-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; VI-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fptrunc_v2f64_to_v2f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] +; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fptrunc_v2f64_to_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX11-NEXT: v_cvt_f32_f64_e32 v1, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -86,12 +367,79 @@ ret void } -; GCN-LABEL: {{^}}fneg_fptrunc_f32_to_f16: -; GCN: buffer_load_dword v[[A_F32:[0-9]+]] -; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -v[[A_F32]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( +; SI-LABEL: fneg_fptrunc_f32_to_f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fptrunc_f32_to_f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fneg_fptrunc_f32_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_fptrunc_f32_to_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -102,12 +450,79 @@ ret void } -; GCN-LABEL: {{^}}fabs_fptrunc_f32_to_f16: -; GCN: buffer_load_dword v[[A_F32:[0-9]+]] -; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]| -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( +; SI-LABEL: fabs_fptrunc_f32_to_f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fabs_fptrunc_f32_to_f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fabs_fptrunc_f32_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fabs_fptrunc_f32_to_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -118,12 +533,79 @@ ret void } -; GCN-LABEL: {{^}}fneg_fabs_fptrunc_f32_to_f16: -; GCN: buffer_load_dword v[[A_F32:[0-9]+]] -; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -|v[[A_F32]]| -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( +; SI-LABEL: fneg_fabs_fptrunc_f32_to_f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e64 v0, -|v0| +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fneg_fabs_fptrunc_f32_to_f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f16_f32_e64 v0, -|v0| +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fneg_fabs_fptrunc_f32_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e64 v0, -|v0| +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fneg_fabs_fptrunc_f32_to_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e64 v0, -|v0| +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -135,13 +617,81 @@ ret void } -; GCN-LABEL: {{^}}fptrunc_f32_to_f16_zext_i32: -; GCN: buffer_load_dword v[[A_F32:[0-9]+]] -; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] -; SIVI-NOT: v[[R_F16]] -; GFX9-NOT: v_and_b32 -; GCN: buffer_store_dword v[[R_F16]] define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( +; SI-LABEL: fptrunc_f32_to_f16_zext_i32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fptrunc_f32_to_f16_zext_i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fptrunc_f32_to_f16_zext_i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fptrunc_f32_to_f16_zext_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -153,13 +703,81 @@ ret void } -; GCN-LABEL: {{^}}fptrunc_fabs_f32_to_f16_zext_i32: -; GCN: buffer_load_dword v[[A_F32:[0-9]+]] -; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]| -; SIVI-NOT: v[[R_F16]] -; GFX9-NOT: v_and_b32 -; GCN: buffer_store_dword v[[R_F16]] define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( +; SI-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -172,12 +790,84 @@ ret void } -; GCN-LABEL: {{^}}fptrunc_f32_to_f16_sext_i32: -; GCN: buffer_load_dword v[[A_F32:[0-9]+]] -; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] -; GCN: v_bfe_i32 v[[R_F16_SEXT:[0-9]+]], v[[R_F16]], 0, 16 -; GCN: buffer_store_dword v[[R_F16_SEXT]] define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( +; SI-LABEL: fptrunc_f32_to_f16_sext_i32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fptrunc_f32_to_f16_sext_i32: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fptrunc_f32_to_f16_sext_i32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fptrunc_f32_to_f16_sext_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -193,3 +883,5 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -1,67 +1,541 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN-UNSAFE %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI-UNSAFE %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SAFE %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-UNSAFE %s -; GCN-LABEL: {{^}}fptrunc_f64_to_f32: -; GCN: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) { +; SI-LABEL: fptrunc_f64_to_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fptrunc_f64_to_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; VI-UNSAFE-LABEL: fptrunc_f64_to_f32: +; VI-UNSAFE: ; %bb.0: +; VI-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 +; VI-UNSAFE-NEXT: s_mov_b32 s6, -1 +; VI-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; VI-UNSAFE-NEXT: s_mov_b32 s4, s0 +; VI-UNSAFE-NEXT: s_mov_b32 s5, s1 +; VI-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-UNSAFE-NEXT: s_endpgm +; +; GFX11-LABEL: fptrunc_f64_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = fptrunc double %in to float store float %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}fptrunc_f64_to_f16: -; GCN-NOT: v_cvt -; GCN-UNSAFE: v_cvt_f32_f64_e32 [[F32:v[0-9]+]] -; GCN-UNSAFE: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[F32]] define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) { +; SI-LABEL: fptrunc_f64_to_f16: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_lshr_b32 s4, s7, 8 +; SI-NEXT: s_and_b32 s5, s7, 0x1ff +; SI-NEXT: s_and_b32 s8, s4, 0xffe +; SI-NEXT: s_or_b32 s4, s5, s6 +; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: s_bfe_u32 s4, s7, 0xb0014 +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: s_sub_i32 s6, 0x3f1, s4 +; SI-NEXT: s_add_i32 s10, s4, 0xfffffc10 +; SI-NEXT: s_or_b32 s11, s8, s5 +; SI-NEXT: v_med3_i32 v0, s6, 0, 13 +; SI-NEXT: s_lshl_b32 s4, s10, 12 +; SI-NEXT: s_or_b32 s5, s11, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: s_or_b32 s4, s11, s4 +; SI-NEXT: s_lshr_b32 s6, s5, s6 +; SI-NEXT: v_lshl_b32_e32 v0, s6, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, s5, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_readfirstlane_b32 s5, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_cmp_lt_i32 s10, 1 +; SI-NEXT: s_cselect_b32 s6, s5, s4 +; SI-NEXT: s_and_b32 s8, s6, 7 +; SI-NEXT: s_cmp_gt_i32 s8, 5 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; SI-NEXT: s_lshr_b32 s6, s6, 2 +; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_cmp_lg_u32 s4, 0 +; SI-NEXT: s_addc_u32 s4, s6, 0 +; SI-NEXT: s_cmp_lt_i32 s10, 31 +; SI-NEXT: s_cselect_b32 s6, s4, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s11, 0 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: s_cmpk_eq_i32 s10, 0x40f +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 +; SI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 +; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_lshr_b32 s4, s7, 16 +; SI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SI-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fptrunc_f64_to_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_lshr_b32 s4, s7, 8 +; VI-NEXT: s_and_b32 s8, s4, 0xffe +; VI-NEXT: s_and_b32 s4, s7, 0x1ff +; VI-NEXT: s_or_b32 s4, s4, s6 +; VI-NEXT: s_cmp_lg_u32 s4, 0 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-NEXT: v_readfirstlane_b32 s4, v0 +; VI-NEXT: s_bfe_u32 s5, s7, 0xb0014 +; VI-NEXT: s_or_b32 s6, s8, s4 +; VI-NEXT: s_sub_i32 s8, 0x3f1, s5 +; VI-NEXT: v_med3_i32 v0, s8, 0, 13 +; VI-NEXT: s_or_b32 s4, s6, 0x1000 +; VI-NEXT: v_readfirstlane_b32 s8, v0 +; VI-NEXT: s_lshr_b32 s8, s4, s8 +; VI-NEXT: v_lshlrev_b32_e64 v0, v0, s8 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-NEXT: s_add_i32 s10, s5, 0xfffffc10 +; VI-NEXT: v_readfirstlane_b32 s4, v0 +; VI-NEXT: s_lshl_b32 s5, s10, 12 +; VI-NEXT: s_or_b32 s4, s8, s4 +; VI-NEXT: s_or_b32 s5, s6, s5 +; VI-NEXT: s_cmp_lt_i32 s10, 1 +; VI-NEXT: s_cselect_b32 s11, s4, s5 +; VI-NEXT: s_and_b32 s8, s11, 7 +; VI-NEXT: s_cmp_gt_i32 s8, 5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: s_cmp_eq_u32 s8, 3 +; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; VI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; VI-NEXT: s_lshr_b32 s8, s11, 2 +; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 +; VI-NEXT: s_addc_u32 s4, s8, 0 +; VI-NEXT: s_cmp_lt_i32 s10, 31 +; VI-NEXT: s_cselect_b32 s8, s4, 0x7c00 +; VI-NEXT: s_cmp_lg_u32 s6, 0 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-NEXT: v_lshlrev_b32_e32 v0, 9, v0 +; VI-NEXT: s_cmpk_eq_i32 s10, 0x40f +; VI-NEXT: v_or_b32_e32 v0, 0x7c00, v0 +; VI-NEXT: v_mov_b32_e32 v1, s8 +; VI-NEXT: s_cselect_b64 vcc, -1, 0 +; VI-NEXT: s_lshr_b32 s4, s7, 16 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; VI-NEXT: s_and_b32 s4, s4, 0x8000 +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; VI-UNSAFE-LABEL: fptrunc_f64_to_f16: +; VI-UNSAFE: ; %bb.0: +; VI-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; VI-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 +; VI-UNSAFE-NEXT: s_mov_b32 s2, -1 +; VI-UNSAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-UNSAFE-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-UNSAFE-NEXT: s_endpgm +; +; GFX11-SAFE-LABEL: fptrunc_f64_to_f16: +; GFX11-SAFE: ; %bb.0: +; GFX11-SAFE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SAFE-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SAFE-NEXT: s_and_b32 s4, s3, 0x1ff +; GFX11-SAFE-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-SAFE-NEXT: s_or_b32 s2, s4, s2 +; GFX11-SAFE-NEXT: s_and_b32 s4, s5, 0xffe +; GFX11-SAFE-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SAFE-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-SAFE-NEXT: s_bfe_u32 s2, s3, 0xb0014 +; GFX11-SAFE-NEXT: s_sub_i32 s5, 0x3f1, s2 +; GFX11-SAFE-NEXT: s_addk_i32 s2, 0xfc10 +; GFX11-SAFE-NEXT: v_med3_i32 v1, s5, 0, 13 +; GFX11-SAFE-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-SAFE-NEXT: s_lshl_b32 s7, s2, 12 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-NEXT: v_readfirstlane_b32 s6, v1 +; GFX11-SAFE-NEXT: s_or_b32 s4, s4, s5 +; GFX11-SAFE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: s_or_b32 s5, s4, 0x1000 +; GFX11-SAFE-NEXT: s_or_b32 s7, s4, s7 +; GFX11-SAFE-NEXT: s_lshr_b32 s6, s5, s6 +; GFX11-SAFE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_lshlrev_b32_e64 v0, v1, s6 +; GFX11-SAFE-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0 +; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-SAFE-NEXT: s_or_b32 s5, s6, s5 +; GFX11-SAFE-NEXT: s_cmp_lt_i32 s2, 1 +; GFX11-SAFE-NEXT: s_cselect_b32 s5, s5, s7 +; GFX11-SAFE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-NEXT: s_and_b32 s6, s5, 7 +; GFX11-SAFE-NEXT: s_cmp_gt_i32 s6, 5 +; GFX11-SAFE-NEXT: s_cselect_b32 s7, -1, 0 +; GFX11-SAFE-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-SAFE-NEXT: s_cselect_b32 s6, -1, 0 +; GFX11-SAFE-NEXT: s_lshr_b32 s5, s5, 2 +; GFX11-SAFE-NEXT: s_or_b32 s6, s6, s7 +; GFX11-SAFE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SAFE-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-SAFE-NEXT: s_addc_u32 s5, s5, 0 +; GFX11-SAFE-NEXT: s_cmp_lt_i32 s2, 31 +; GFX11-SAFE-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX11-SAFE-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-SAFE-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-SAFE-NEXT: s_cmpk_eq_i32 s2, 0x40f +; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX11-SAFE-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-SAFE-NEXT: s_lshr_b32 s2, s3, 16 +; GFX11-SAFE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-SAFE-NEXT: s_and_b32 s2, s2, 0x8000 +; GFX11-SAFE-NEXT: v_lshlrev_b32_e32 v0, 9, v0 +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_or_b32_e32 v0, 0x7c00, v0 +; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo +; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX11-SAFE-NEXT: s_mov_b32 s2, -1 +; GFX11-SAFE-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-SAFE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-SAFE-NEXT: s_endpgm +; +; GFX11-UNSAFE-LABEL: fptrunc_f64_to_f16: +; GFX11-UNSAFE: ; %bb.0: +; GFX11-UNSAFE-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-UNSAFE-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX11-UNSAFE-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-UNSAFE-NEXT: s_mov_b32 s2, -1 +; GFX11-UNSAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-UNSAFE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-UNSAFE-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-UNSAFE-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-UNSAFE-NEXT: s_endpgm %result = fptrunc double %in to half %result_i16 = bitcast half %result to i16 store i16 %result_i16, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f32: -; GCN: v_cvt_f32_f64_e32 -; GCN: v_cvt_f32_f64_e32 define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x double> %in) { +; SI-LABEL: fptrunc_v2f64_to_v2f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fptrunc_v2f64_to_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; VI-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; VI-UNSAFE-LABEL: fptrunc_v2f64_to_v2f32: +; VI-UNSAFE: ; %bb.0: +; VI-UNSAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; VI-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 +; VI-UNSAFE-NEXT: s_mov_b32 s2, -1 +; VI-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; VI-UNSAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-UNSAFE-NEXT: s_endpgm +; +; GFX11-LABEL: fptrunc_v2f64_to_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; GFX11-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = fptrunc <2 x double> %in to <2 x float> store <2 x float> %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}fptrunc_v3f64_to_v3f32: -; GCN: v_cvt_f32_f64_e32 -; GCN: v_cvt_f32_f64_e32 -; GCN: v_cvt_f32_f64_e32 define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x double> %in) { +; SI-LABEL: fptrunc_v3f64_to_v3f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x15 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; SI-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fptrunc_v3f64_to_v3f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] +; VI-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; VI-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; VI-UNSAFE-LABEL: fptrunc_v3f64_to_v3f32: +; VI-UNSAFE: ; %bb.0: +; VI-UNSAFE-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54 +; VI-UNSAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; VI-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; VI-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 +; VI-UNSAFE-NEXT: s_mov_b32 s2, -1 +; VI-UNSAFE-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; VI-UNSAFE-NEXT: s_endpgm +; +; GFX11-LABEL: fptrunc_v3f64_to_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x54 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] +; GFX11-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; GFX11-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = fptrunc <3 x double> %in to <3 x float> store <3 x float> %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}fptrunc_v4f64_to_v4f32: -; GCN: v_cvt_f32_f64_e32 -; GCN: v_cvt_f32_f64_e32 -; GCN: v_cvt_f32_f64_e32 -; GCN: v_cvt_f32_f64_e32 define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x double> %in) { +; SI-LABEL: fptrunc_v4f64_to_v4f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] +; SI-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fptrunc_v4f64_to_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] +; VI-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; VI-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; VI-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; VI-UNSAFE-LABEL: fptrunc_v4f64_to_v4f32: +; VI-UNSAFE: ; %bb.0: +; VI-UNSAFE-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 +; VI-UNSAFE-NEXT: s_mov_b32 s2, -1 +; VI-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; VI-UNSAFE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-UNSAFE-NEXT: s_endpgm +; +; GFX11-LABEL: fptrunc_v4f64_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] +; GFX11-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; GFX11-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; GFX11-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = fptrunc <4 x double> %in to <4 x float> store <4 x float> %result, ptr addrspace(1) %out ret void } -; GCN-LABEL: {{^}}fptrunc_v8f64_to_v8f32: -; GCN: v_cvt_f32_f64_e32 -; GCN: v_cvt_f32_f64_e32 -; GCN: v_cvt_f32_f64_e32 -; GCN: v_cvt_f32_f64_e32 -; GCN: v_cvt_f32_f64_e32 -; GCN: v_cvt_f32_f64_e32 -; GCN: v_cvt_f32_f64_e32 -; GCN: v_cvt_f32_f64_e32 define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x double> %in) { +; SI-LABEL: fptrunc_v8f64_to_v8f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] +; SI-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; SI-NEXT: v_cvt_f32_f64_e32 v7, s[18:19] +; SI-NEXT: v_cvt_f32_f64_e32 v6, s[16:17] +; SI-NEXT: v_cvt_f32_f64_e32 v5, s[14:15] +; SI-NEXT: v_cvt_f32_f64_e32 v4, s[12:13] +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fptrunc_v8f64_to_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f64_e32 v7, s[18:19] +; VI-NEXT: v_cvt_f32_f64_e32 v6, s[16:17] +; VI-NEXT: v_cvt_f32_f64_e32 v5, s[14:15] +; VI-NEXT: v_cvt_f32_f64_e32 v4, s[12:13] +; VI-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] +; VI-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; VI-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; VI-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; VI-UNSAFE-LABEL: fptrunc_v8f64_to_v8f32: +; VI-UNSAFE: ; %bb.0: +; VI-UNSAFE-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64 +; VI-UNSAFE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-UNSAFE-NEXT: s_mov_b32 s3, 0xf000 +; VI-UNSAFE-NEXT: s_mov_b32 s2, -1 +; VI-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v7, s[18:19] +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v6, s[16:17] +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v5, s[14:15] +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v4, s[12:13] +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; VI-UNSAFE-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; VI-UNSAFE-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-UNSAFE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-UNSAFE-NEXT: s_endpgm +; +; GFX11-LABEL: fptrunc_v8f64_to_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x64 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_f64_e32 v7, s[18:19] +; GFX11-NEXT: v_cvt_f32_f64_e32 v6, s[16:17] +; GFX11-NEXT: v_cvt_f32_f64_e32 v5, s[14:15] +; GFX11-NEXT: v_cvt_f32_f64_e32 v4, s[12:13] +; GFX11-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] +; GFX11-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] +; GFX11-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; GFX11-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = fptrunc <8 x double> %in to <8 x float> store <8 x float> %result, ptr addrspace(1) %out ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll @@ -1,18 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX89,VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX89,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s -; GCN-LABEL: {{^}}fsub_f16: -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_sub_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fsub_f16( +; SI-LABEL: fsub_f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_sub_f32_e32 v0, v0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fsub_f16: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_mov_b32 s14, s2 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s12, s6 +; GFX89-NEXT: s_mov_b32 s13, s7 +; GFX89-NEXT: s_mov_b32 s15, s3 +; GFX89-NEXT: s_mov_b32 s10, s2 +; GFX89-NEXT: s_mov_b32 s11, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_mov_b32 s0, s4 +; GFX89-NEXT: s_mov_b32 s1, s5 +; GFX89-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fsub_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -24,15 +89,63 @@ ret void } -; GCN-LABEL: {{^}}fsub_f16_imm_a: -; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] -; SI: v_sub_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fsub_f16_imm_a( +; SI-LABEL: fsub_f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_sub_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fsub_f16_imm_a: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_sub_f16_e32 v0, 1.0, v0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fsub_f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: v_sub_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) { entry: @@ -42,15 +155,63 @@ ret void } -; GCN-LABEL: {{^}}fsub_f16_imm_b: -; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] -; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] -; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], -2.0, v[[A_F32]] -; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; GFX89: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]] -; GCN: buffer_store_short v[[R_F16]] -; GCN: s_endpgm define amdgpu_kernel void @fsub_f16_imm_b( +; SI-LABEL: fsub_f16_imm_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, -2.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX89-LABEL: fsub_f16_imm_b: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s10, s6 +; GFX89-NEXT: s_mov_b32 s11, s7 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_mov_b32 s4, s0 +; GFX89-NEXT: s_mov_b32 s5, s1 +; GFX89-NEXT: v_add_f16_e32 v0, -2.0, v0 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_endpgm +; +; GFX11-LABEL: fsub_f16_imm_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: v_add_f16_e32 v0, -2.0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -60,41 +221,110 @@ ret void } -; GCN-LABEL: {{^}}fsub_v2f16: -; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] - -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] - -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]] -; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]] - -; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] -; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - - -; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]] - -; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1] - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm - define amdgpu_kernel void @fsub_v2f16( +; SI-LABEL: fsub_v2f16: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_sub_f32_e32 v2, v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_sub_f32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fsub_v2f16: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_f16_e32 v0, v1, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fsub_v2f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_mov_b32 s14, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b32 s15, s3 +; GFX9-NEXT: s_mov_b32 s10, s2 +; GFX9-NEXT: s_mov_b32 s11, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fsub_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -106,31 +336,91 @@ ret void } -; GCN-LABEL: {{^}}fsub_v2f16_imm_a: -; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] - -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] -; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 -; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00 -; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] neg_lo:[1,0] neg_hi:[1,0] - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm - define amdgpu_kernel void @fsub_v2f16_imm_a( +; SI-LABEL: fsub_v2f16_imm_a: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_sub_f32_e32 v1, 2.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_sub_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fsub_v2f16_imm_a: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: v_mov_b32_e32 v1, 0x4000 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_sub_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fsub_v2f16_imm_a: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x40003c00 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v0, v0, s0 neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fsub_v2f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) { entry: @@ -140,31 +430,91 @@ ret void } -; GCN-LABEL: {{^}}fsub_v2f16_imm_b: -; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] - -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] -; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]] -; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00 -; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONSTM1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]] - -; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000 -; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]{{$}} - -; GCN: buffer_store_dword v[[R_V2_F16]] -; GCN: s_endpgm - define amdgpu_kernel void @fsub_v2f16_imm_b( +; SI-LABEL: fsub_v2f16_imm_b: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v1, -1.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v0, -2.0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: fsub_v2f16_imm_b: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: v_mov_b32_e32 v1, 0xbc00 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, -2.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: fsub_v2f16_imm_b: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s10, s6 +; GFX9-NEXT: s_mov_b32 s11, s7 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s0, 0xbc00c000 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v0, v0, s0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: fsub_v2f16_imm_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, 0xbc00c000, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -173,3 +523,5 @@ store <2 x half> %r.val, ptr addrspace(1) %r ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -1,44 +1,148 @@ -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIGFX89,CI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIGFX89,GFX89,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIGFX89,GFX89,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s -; GCN-LABEL: {{^}}void_func_i1: -; GCN: v_and_b32_e32 v0, 1, v0 -; GCN: buffer_store_byte v0, off define void @void_func_i1(i1 %arg0) #0 { +; CIGFX89-LABEL: void_func_i1: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store i1 %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i1_zeroext: -; GCN: s_waitcnt -; GCN-NEXT: v_or_b32_e32 v0, 12, v0 -; GCN-NOT: v0 -; GCN: buffer_store_dword v0, off define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 { +; CIGFX89-LABEL: void_func_i1_zeroext: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: v_or_b32_e32 v0, 12, v0 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i1_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_or_b32_e32 v0, 12, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ext = zext i1 %arg0 to i32 %add = add i32 %ext, 12 store i32 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i1_signext: -; GCN: s_waitcnt -; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 -; GCN-NOT: v0 -; GCN: buffer_store_dword v0, off define void @void_func_i1_signext(i1 signext %arg0) #0 { +; CI-LABEL: void_func_i1_signext: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, 12, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_i1_signext: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 12, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_i1_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 12, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i1_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 12, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ext = sext i1 %arg0 to i32 %add = add i32 %ext, 12 store i32 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}i1_arg_i1_use: -; GCN: v_and_b32_e32 v0, 1, v0 -; GCN: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1 define void @i1_arg_i1_use(i1 %arg) #0 { +; CIGFX89-LABEL: i1_arg_i1_use: +; CIGFX89: ; %bb.0: ; %bb +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0 +; CIGFX89-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CIGFX89-NEXT: s_xor_b64 s[6:7], vcc, -1 +; CIGFX89-NEXT: s_and_saveexec_b64 s[4:5], s[6:7] +; CIGFX89-NEXT: s_cbranch_execz .LBB3_2 +; CIGFX89-NEXT: ; %bb.1: ; %bb1 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: v_mov_b32_e32 v0, 0 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: .LBB3_2: ; %bb2 +; CIGFX89-NEXT: s_or_b64 exec, exec, s[4:5] +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i1_arg_i1_use: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: s_xor_b32 s1, vcc_lo, -1 +; GFX11-NEXT: s_and_saveexec_b32 s0, s1 +; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: ; %bb.1: ; %bb1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: .LBB3_2: ; %bb2 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: br i1 %arg, label %bb2, label %bb1 @@ -50,304 +154,1139 @@ ret void } -; GCN-LABEL: {{^}}void_func_i8: -; GCN-NOT: v0 -; GCN: buffer_store_byte v0, off define void @void_func_i8(i8 %arg0) #0 { +; CIGFX89-LABEL: void_func_i8: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store i8 %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i8_zeroext: -; GCN-NOT: and_b32 -; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 { +; CI-LABEL: void_func_i8_zeroext: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, 12, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_i8_zeroext: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 12, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_i8_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 12, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i8_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 12, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ext = zext i8 %arg0 to i32 %add = add i32 %ext, 12 store i32 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i8_signext: -; GCN-NOT: v_bfe_i32 -; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 define void @void_func_i8_signext(i8 signext %arg0) #0 { +; CI-LABEL: void_func_i8_signext: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, 12, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_i8_signext: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 12, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_i8_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 12, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i8_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 12, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ext = sext i8 %arg0 to i32 %add = add i32 %ext, 12 store i32 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i16: -; GCN: buffer_store_short v0, off define void @void_func_i16(i16 %arg0) #0 { +; CIGFX89-LABEL: void_func_i16: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store i16 %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i16_zeroext: -; GCN-NOT: v0 -; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 { +; CI-LABEL: void_func_i16_zeroext: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, 12, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_i16_zeroext: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 12, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_i16_zeroext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 12, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i16_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 12, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ext = zext i16 %arg0 to i32 %add = add i32 %ext, 12 store i32 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i16_signext: -; GCN-NOT: v0 -; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0 define void @void_func_i16_signext(i16 signext %arg0) #0 { +; CI-LABEL: void_func_i16_signext: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, 12, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_i16_signext: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 12, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_i16_signext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 12, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i16_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 12, v0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ext = sext i16 %arg0 to i32 %add = add i32 %ext, 12 store i32 %add, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i32: -; GCN-NOT: v0 -; GCN: buffer_store_dword v0, off define void @void_func_i32(i32 %arg0) #0 { +; CIGFX89-LABEL: void_func_i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store i32 %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_i64: -; GCN-NOT: v[0:1] -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN: buffer_store_dwordx2 v[0:1], off define void @void_func_i64(i64 %arg0) #0 { +; CIGFX89-LABEL: void_func_i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store i64 %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_f16: -; VI-NOT: v0 -; CI: v_cvt_f16_f32_e32 v0, v0 -; GCN: buffer_store_short v0, off define void @void_func_f16(half %arg0) #0 { +; CI-LABEL: void_func_f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_f16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store half %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_f32 -; GCN-NOT: v0 -; GCN: buffer_store_dword v0, off define void @void_func_f32(float %arg0) #0 { +; CIGFX89-LABEL: void_func_f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store float %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_f64: -; GCN-NOT: v[0:1] -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN: buffer_store_dwordx2 v[0:1], off define void @void_func_f64(double %arg0) #0 { +; CIGFX89-LABEL: void_func_f64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store double %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v2i32: -; GCN-NOT: v[0:1] -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN: buffer_store_dwordx2 v[0:1], off define void @void_func_v2i32(<2 x i32> %arg0) #0 { +; CIGFX89-LABEL: void_func_v2i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <2 x i32> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v3i32: -; GCN-DAG: buffer_store_dwordx3 v[0:2], off define void @void_func_v3i32(<3 x i32> %arg0) #0 { +; CIGFX89-LABEL: void_func_v3i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <3 x i32> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v4i32: -; GCN: buffer_store_dwordx4 v[0:3], off define void @void_func_v4i32(<4 x i32> %arg0) #0 { +; CIGFX89-LABEL: void_func_v4i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <4 x i32> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v5i32: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dword v4, off define void @void_func_v5i32(<5 x i32> %arg0) #0 { +; CIGFX89-LABEL: void_func_v5i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v4, off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v5i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b32 v4, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <5 x i32> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v8i32: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off define void @void_func_v8i32(<8 x i32> %arg0) #0 { +; CIGFX89-LABEL: void_func_v8i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v8i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <8 x i32> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v16i32: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off define void @void_func_v16i32(<16 x i32> %arg0) #0 { +; CIGFX89-LABEL: void_func_v16i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x i32> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v32i32: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off -; GCN-DAG: buffer_store_dwordx4 v[16:19], off -; GCN-DAG: buffer_store_dwordx4 v[20:23], off -; GCN-DAG: buffer_store_dwordx4 v[24:27], off -; GCN-DAG: buffer_store_dwordx4 v[28:31], off define void @void_func_v32i32(<32 x i32> %arg0) #0 { +; CIGFX89-LABEL: void_func_v32i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(6) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <32 x i32> %arg0, ptr addrspace(1) undef ret void } ; 1 over register limit -; GCN-LABEL: {{^}}void_func_v33i32: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off -; GCN-DAG: buffer_load_dword v31, off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_store_dwordx4 v[16:19], off -; GCN-DAG: buffer_store_dwordx4 v[20:23], off -; GCN-DAG: buffer_store_dwordx4 v[24:27], off -; GCN-DAG: buffer_store_dwordx4 v[28:31], off -; GCN: buffer_store_dword [[STACKLOAD]], off define void @void_func_v33i32(<33 x i32> %arg0) #0 { +; CI-LABEL: void_func_v33i32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v33i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v33i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v33i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b32 v32, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <33 x i32> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v2i64: -; GCN: buffer_store_dwordx4 v[0:3], off define void @void_func_v2i64(<2 x i64> %arg0) #0 { +; CIGFX89-LABEL: void_func_v2i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <2 x i64> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v3i64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx2 v[4:5], off define void @void_func_v3i64(<3 x i64> %arg0) #0 { +; CIGFX89-LABEL: void_func_v3i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <3 x i64> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v4i64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off define void @void_func_v4i64(<4 x i64> %arg0) #0 { +; CIGFX89-LABEL: void_func_v4i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v4i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <4 x i64> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v5i64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx2 v[8:9], off define void @void_func_v5i64(<5 x i64> %arg0) #0 { +; CIGFX89-LABEL: void_func_v5i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v5i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b64 v[8:9], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <5 x i64> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v8i64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off define void @void_func_v8i64(<8 x i64> %arg0) #0 { +; CIGFX89-LABEL: void_func_v8i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v8i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <8 x i64> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v16i64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off -; GCN-DAG: buffer_store_dwordx4 v[16:19], off -; GCN-DAG: buffer_store_dwordx4 v[20:23], off -; GCN-DAG: buffer_store_dwordx4 v[24:27], off -; GCN-DAG: buffer_store_dwordx4 v[28:31], off define void @void_func_v16i64(<16 x i64> %arg0) #0 { +; CIGFX89-LABEL: void_func_v16i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(6) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v16i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x i64> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v2i16: -; GFX9-NOT: v0 -; GFX9: buffer_store_dword v0, off define void @void_func_v2i16(<2 x i16> %arg0) #0 { +; CI-LABEL: void_func_v2i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v2i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <2 x i16> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v3i16: -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off -; GCN-DAG: buffer_store_short v{{[0-9]+}}, off define void @void_func_v3i16(<3 x i16> %arg0) #0 { +; CI-LABEL: void_func_v3i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: buffer_store_short v2, off, s[4:7], 0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v3i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_short v1, off, s[4:7], 0 +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <3 x i16> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v4i16: -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9: buffer_store_dwordx2 v[0:1], off define void @void_func_v4i16(<4 x i16> %arg0) #0 { +; CI-LABEL: void_func_v4i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_or_b32_e32 v1, v0, v1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v4i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <4 x i16> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v5i16: -; CI: v_lshlrev_b32 -; CI: v_and_b32 -; CI: v_lshlrev_b32 -; CI: v_or_b32 -; CI: v_or_b32 -; CI-DAG: buffer_store_short v -; CI-DAG: buffer_store_dwordx2 v - -; GFX89-DAG: buffer_store_short v2, off, -; GFX89-DAG: buffer_store_dwordx2 v[0:1], off - define void @void_func_v5i16(<5 x i16> %arg0) #0 { +; CI-LABEL: void_func_v5i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_or_b32_e32 v1, v0, v1 +; CI-NEXT: buffer_store_short v4, off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v5i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_short v2, off, s[4:7], 0 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v5i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b16 v2, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <5 x i16> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v8i16: -; GFX9-DAG: buffer_store_dwordx4 v[0:3], off define void @void_func_v8i16(<8 x i16> %arg0) #0 { +; CI-LABEL: void_func_v8i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v6, v6, v7 +; CI-NEXT: v_or_b32_e32 v5, v4, v5 +; CI-NEXT: v_or_b32_e32 v4, v2, v3 +; CI-NEXT: v_or_b32_e32 v3, v0, v1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v8i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v8i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <8 x i16> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v16i16: -; GFX9-DAG: buffer_store_dwordx4 v[0:3], off -; GFX9-DAG: buffer_store_dwordx4 v[4:7], off define void @void_func_v16i16(<16 x i16> %arg0) #0 { +; CI-LABEL: void_func_v16i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v5, v4, v5 +; CI-NEXT: v_or_b32_e32 v4, v2, v3 +; CI-NEXT: v_or_b32_e32 v3, v0, v1 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; CI-NEXT: v_or_b32_e32 v14, v1, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; CI-NEXT: v_or_b32_e32 v13, v1, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; CI-NEXT: v_or_b32_e32 v12, v1, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; CI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; CI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; CI-NEXT: v_or_b32_e32 v11, v1, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_or_b32_e32 v6, v6, v7 +; CI-NEXT: buffer_store_dwordx4 v[11:14], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v16i16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v16i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x i16> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v2i24: -; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}v0, v1 define void @void_func_v2i24(<2 x i24> %arg0) #0 { +; CI-LABEL: void_func_v2i24: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v2i24: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v2i24: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v2i24: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b8 v1, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %elt0 = extractelement <2 x i24> %arg0, i32 0 %elt1 = extractelement <2 x i24> %arg0, i32 1 %add = add i24 %elt0, %elt1 @@ -355,197 +1294,734 @@ ret void } -; GCN-LABEL: {{^}}void_func_v2f32: -; GCN-NOT: v[0:1] -; GCN-NOT: v0 -; GCN-NOT: v1 -; GCN: buffer_store_dwordx2 v[0:1], off define void @void_func_v2f32(<2 x float> %arg0) #0 { +; CIGFX89-LABEL: void_func_v2f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <2 x float> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v3f32: -; GCN-DAG: buffer_store_dwordx3 v[0:2], off define void @void_func_v3f32(<3 x float> %arg0) #0 { +; CIGFX89-LABEL: void_func_v3f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <3 x float> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v4f32: -; GCN: buffer_store_dwordx4 v[0:3], off define void @void_func_v4f32(<4 x float> %arg0) #0 { +; CIGFX89-LABEL: void_func_v4f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <4 x float> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v8f32: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off define void @void_func_v8f32(<8 x float> %arg0) #0 { +; CIGFX89-LABEL: void_func_v8f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <8 x float> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v16f32: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off define void @void_func_v16f32(<16 x float> %arg0) #0 { +; CIGFX89-LABEL: void_func_v16f32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v16f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x float> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v2f64: -; GCN: buffer_store_dwordx4 v[0:3], off define void @void_func_v2f64(<2 x double> %arg0) #0 { +; CIGFX89-LABEL: void_func_v2f64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <2 x double> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v3f64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx2 v[4:5], off define void @void_func_v3f64(<3 x double> %arg0) #0 { +; CIGFX89-LABEL: void_func_v3f64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <3 x double> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v4f64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off define void @void_func_v4f64(<4 x double> %arg0) #0 { +; CIGFX89-LABEL: void_func_v4f64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <4 x double> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v8f64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off define void @void_func_v8f64(<8 x double> %arg0) #0 { +; CIGFX89-LABEL: void_func_v8f64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v8f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <8 x double> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v16f64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off -; GCN-DAG: buffer_store_dwordx4 v[16:19], off -; GCN-DAG: buffer_store_dwordx4 v[20:23], off -; GCN-DAG: buffer_store_dwordx4 v[24:27], off -; GCN-DAG: buffer_store_dwordx4 v[28:31], off define void @void_func_v16f64(<16 x double> %arg0) #0 { +; CIGFX89-LABEL: void_func_v16f64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(6) +; CIGFX89-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v16f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x double> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v2f16: -; GFX9-NOT: v0 -; GFX9: buffer_store_dword v0, off define void @void_func_v2f16(<2 x half> %arg0) #0 { +; CI-LABEL: void_func_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v2f16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <2 x half> %arg0, ptr addrspace(1) undef ret void } ; FIXME: Different abi if f16 legal -; GCN-LABEL: {{^}}void_func_v3f16: -; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v0 -; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v1 -; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v2 - -; GFX89-DAG: v0 -; GFX89-DAG: v1 - -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_dword define void @void_func_v3f16(<3 x half> %arg0) #0 { +; CI-LABEL: void_func_v3f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_or_b32_e32 v0, v0, v1 +; CI-NEXT: buffer_store_short v2, off, s[4:7], 0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v3f16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_short v1, off, s[4:7], 0 +; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <3 x half> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v4f16: -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9-NOT: v[0:1] -; GFX9: buffer_store_dwordx2 v[0:1], off define void @void_func_v4f16(<4 x half> %arg0) #0 { +; CI-LABEL: void_func_v4f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; CI-NEXT: v_or_b32_e32 v1, v2, v1 +; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; CI-NEXT: v_or_b32_e32 v0, v0, v2 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v4f16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <4 x half> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v8f16: -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9: buffer_store_dwordx4 v[0:3], off define void @void_func_v8f16(<8 x half> %arg0) #0 { +; CI-LABEL: void_func_v8f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; CI-NEXT: v_or_b32_e32 v5, v6, v5 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v4, v4, v6 +; CI-NEXT: v_or_b32_e32 v3, v2, v3 +; CI-NEXT: v_or_b32_e32 v2, v0, v1 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v8f16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <8 x half> %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v16f16: -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9-DAG: buffer_store_dwordx4 v[0:3], off -; GFX9-DAG: buffer_store_dwordx4 v[4:7], off define void @void_func_v16f16(<16 x half> %arg0) #0 { +; CI-LABEL: void_func_v16f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_or_b32_e32 v5, v6, v5 +; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 +; CI-NEXT: v_or_b32_e32 v3, v2, v3 +; CI-NEXT: v_or_b32_e32 v2, v0, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v15 +; CI-NEXT: v_or_b32_e32 v4, v4, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v13, v1, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; CI-NEXT: v_or_b32_e32 v12, v7, v0 +; CI-NEXT: v_cvt_f16_f32_e32 v0, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v8 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_or_b32_e32 v11, v1, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; CI-NEXT: v_or_b32_e32 v10, v7, v0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: void_func_v16f16: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v16f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store <16 x half> %arg0, ptr addrspace(1) undef ret void } ; Make sure there is no alignment requirement for passed vgprs. -; GCN-LABEL: {{^}}void_func_i32_i64_i32: -; GCN-NOT: v0 -; GCN: buffer_store_dword v0, off -; GCN: buffer_store_dwordx2 v[1:2] -; GCN: buffer_store_dword v3 define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 { +; CIGFX89-LABEL: void_func_i32_i64_i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_dword v3, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_i32_i64_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b32 v3, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile i32 %arg0, ptr addrspace(1) undef store volatile i64 %arg1, ptr addrspace(1) undef store volatile i32 %arg2, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_struct_i32: -; GCN-NOT: v0 -; GCN: buffer_store_dword v0, off define void @void_func_struct_i32({ i32 } %arg0) #0 { +; CIGFX89-LABEL: void_func_struct_i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_struct_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store { i32 } %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_struct_i8_i32: -; GCN-DAG: buffer_store_byte v0, off -; GCN-DAG: buffer_store_dword v1, off define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 { +; CIGFX89-LABEL: void_func_struct_i8_i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store { i8, i32 } %arg0, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32: -; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_store_dword v[[ELT1]] -; GCN-DAG: buffer_store_byte v[[ELT0]] define void @void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 { +; CIGFX89-LABEL: void_func_byval_struct_i8_i32: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; CIGFX89-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: s_waitcnt vmcnt(1) +; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(1) +; CIGFX89-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_byval_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u8 v1, off, s32 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b8 v1, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %arg0.load = load { i8, i32 }, ptr addrspace(5) %arg0 store { i8, i32 } %arg0.load, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2: -; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32 glc{{$}} -; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:4 glc{{$}} -; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:8 glc{{$}} -; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:12 glc{{$}} - -; GCN: ds_write_b32 v0, v0 -; GCN: s_setpc_b64 define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }) %arg0, ptr addrspace(5) byval({ i8, i32 }) %arg1, i32 %arg2) #0 { +; CI-LABEL: void_func_byval_struct_i8_i32_x2: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_ubyte v3, off, s[0:3], s32 offset:8 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dword v4, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v3, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_byval_struct_i8_i32_x2: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v3, off, s[0:3], s32 offset:8 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v3, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ds_write_b32 v0, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_byval_struct_i8_i32_x2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v1, off, s[0:3], s32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v3, off, s[0:3], s32 offset:8 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v3, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_byval_struct_i8_i32_x2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_u8 v1, off, s32 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_load_b32 v2, off, s32 offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_load_u8 v3, off, s32 offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_load_b32 v4, off, s32 offset:12 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v2, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v1, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b32 v4, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v3, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_store_b32 v0, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %arg0.load = load volatile { i8, i32 }, ptr addrspace(5) %arg0 %arg1.load = load volatile { i8, i32 }, ptr addrspace(5) %arg1 store volatile { i8, i32 } %arg0.load, ptr addrspace(1) undef @@ -554,13 +2030,37 @@ ret void } -; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64: -; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off -; GCN-DAG: buffer_store_dwordx2 v[[[ARG1_LOAD0]]:[[ARG1_LOAD1]]], off define void @void_func_byval_i32_byval_i64(ptr addrspace(5) byval(i32) %arg0, ptr addrspace(5) byval(i64) %arg1) #0 { +; CIGFX89-LABEL: void_func_byval_i32_byval_i64: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; CIGFX89-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; CIGFX89-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: s_waitcnt vmcnt(2) +; CIGFX89-NEXT: buffer_store_dword v2, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(1) +; CIGFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_byval_i32_byval_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v2, off, s32 +; GFX11-NEXT: scratch_load_b64 v[0:1], off, s32 offset:8 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b32 v2, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %arg0.load = load i32, ptr addrspace(5) %arg0 %arg1.load = load i64, ptr addrspace(5) %arg1 store i32 %arg0.load, ptr addrspace(1) undef @@ -568,23 +2068,139 @@ ret void } -; GCN-LABEL: {{^}}void_func_v32i32_i32_i64: -; GCN-DAG: buffer_store_dwordx4 v[0:3], off -; GCN-DAG: buffer_store_dwordx4 v[4:7], off -; GCN-DAG: buffer_store_dwordx4 v[8:11], off -; GCN-DAG: buffer_store_dwordx4 v[12:15], off -; GCN-DAG: buffer_store_dwordx4 v[16:19], off -; GCN-DAG: buffer_store_dwordx4 v[20:23], off -; GCN-DAG: buffer_store_dwordx4 v[24:27], off -; GCN-DAG: buffer_store_dwordx4 v[28:31], off -; GCN-DAG: buffer_load_dword v[[LOAD_ARG0_31:[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:12 - -; GCN: buffer_store_dword v[[LOAD_ARG1]] -; GCN: buffer_store_dwordx2 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]], off define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 { +; CI-LABEL: void_func_v32i32_i32_i64: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_i32_i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_i32_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_i32_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:8 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b64 v[32:33], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile i32 %arg1, ptr addrspace(1) undef store volatile i64 %arg2, ptr addrspace(1) undef @@ -592,26 +2208,167 @@ } ; FIXME: Different ext load types on CI vs. VI -; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16: -; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} - -; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} - -; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]] -; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]] - -; GCN: buffer_store_byte [[TRUNC_ARG1_I1]], off -; GCN: buffer_store_byte [[LOAD_ARG2]], off -; GCN: buffer_store_short [[LOAD_ARG3]], off -; GFX89: buffer_store_short [[LOAD_ARG4]], off - -; CI: buffer_store_short [[CVT_ARG4]], off define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 { +; CI-LABEL: void_func_v32i32_i1_i8_i16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; CI-NEXT: v_and_b32_e32 v0, 1, v16 +; CI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v18, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v19, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_i1_i8_i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v0, 1, v20 +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v17, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v18, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_i1_i8_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 +; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v17, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v18, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_i1_i8_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_u8 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u16 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v35, off, s32 offset:16 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_and_b32_e32 v16, 1, v32 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v16, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b16 v34, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b16 v35, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile i1 %arg1, ptr addrspace(1) undef store volatile i8 %arg2, ptr addrspace(1) undef @@ -620,138 +2377,1136 @@ ret void } -; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}} - -; GCN: buffer_store_dwordx2 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]], off -; GCN: buffer_store_dwordx2 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]], off define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 { +; CI-LABEL: void_func_v32i32_v2i32_v2f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v2i32_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v2i32_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx2 v[16:17], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx2 v[18:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v2i32_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: buffer_store_b64 v[32:33], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b64 v[34:35], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <2 x i32> %arg1, ptr addrspace(1) undef store volatile <2 x float> %arg2, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v32i32_v2i16_v2f16: -; GFX9-DAG: buffer_load_dword [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GFX9-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GFX9: buffer_store_dword [[LOAD_ARG1]], off -; GFX9: buffer_store_short [[LOAD_ARG2]], off define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 { +; CI-LABEL: void_func_v32i32_v2i16_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 +; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v17, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v18, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v16, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v19, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v2i16_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v2i16_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v2i16_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b32 v32, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b32 v33, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <2 x i16> %arg1, ptr addrspace(1) undef store volatile <2 x half> %arg2, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}} - -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]], off -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 { +; CI-LABEL: void_func_v32i32_v2i64_v2f64: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v2i64_v2f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v2i64_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v2i64_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:20 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <2 x i64> %arg1, ptr addrspace(1) undef store volatile <2 x double> %arg2, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}} - -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]], off -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 { +; CI-LABEL: void_func_v32i32_v4i32_v4f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v4i32_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v4i32_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v4i32_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:20 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <4 x i32> %arg1, ptr addrspace(1) undef store volatile <4 x float> %arg2, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:32{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:36{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:40{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:44{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:48{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:52{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:56{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:60{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:64{{$}} - -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]], off -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]], off -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_4]]:[[LOAD_ARG2_7]]], off -; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 { +; CI-LABEL: void_func_v32i32_v8i32_v8f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v8i32_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v8i32_v8f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v8i32_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x10 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:36 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: buffer_store_b128 v[48:51], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <8 x i32> %arg1, ptr addrspace(1) undef store volatile <8 x float> %arg2, ptr addrspace(1) undef ret void } -; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32: -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_:[0-9]+]], off, s[0:3], s32 offset:12{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:20{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:24{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:28{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s32 offset:32{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s32 offset:36{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s32 offset:40{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s32 offset:44{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s32 offset:48{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s32 offset:52{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s32 offset:56{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:60{{$}} - -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:64{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:68{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:72{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:76{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:80{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:84{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:88{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:92{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s32 offset:96{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s32 offset:100{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s32 offset:104{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s32 offset:108{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s32 offset:112{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s32 offset:116{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s32 offset:120{{$}} -; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:124{{$}} define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 { +; CI-LABEL: void_func_v32i32_v16i32_v16f32: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 +; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 +; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 +; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 +; CI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 +; CI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; CI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 +; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v16i32_v16f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v16i32_v16f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:52 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:20 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v16i32_v16f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x20 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:124 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:68 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(15) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: buffer_store_b128 v[84:87], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: buffer_store_b128 v[80:83], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: buffer_store_b128 v[68:71], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: buffer_store_b128 v[64:67], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: buffer_store_b128 v[52:55], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: buffer_store_b128 v[48:51], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b128 v[36:39], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b128 v[32:35], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <16 x i32> %arg1, ptr addrspace(1) undef store volatile <16 x float> %arg2, ptr addrspace(1) undef @@ -759,15 +3514,49 @@ } ; Make sure v3 isn't a wasted register because of v3 types being promoted to v4 -; GCN-LABEL: {{^}}void_func_v3f32_wasted_reg: -; GCN: s_waitcnt -; GCN: ds_write_b32 v{{[0-9]+}}, v0 -; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1 -; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2 -; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3 -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 { +; CI-LABEL: void_func_v3f32_wasted_reg: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: ds_write_b32 v0, v1 +; CI-NEXT: ds_write_b32 v0, v2 +; CI-NEXT: ds_write_b32 v0, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v3f32_wasted_reg: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_write_b32 v0, v0 +; VI-NEXT: ds_write_b32 v0, v1 +; VI-NEXT: ds_write_b32 v0, v2 +; VI-NEXT: ds_write_b32 v0, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v3f32_wasted_reg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-NEXT: ds_write_b32 v0, v1 +; GFX9-NEXT: ds_write_b32 v0, v2 +; GFX9-NEXT: ds_write_b32 v0, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3f32_wasted_reg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_store_b32 v0, v0 +; GFX11-NEXT: ds_store_b32 v0, v1 +; GFX11-NEXT: ds_store_b32 v0, v2 +; GFX11-NEXT: ds_store_b32 v0, v3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %arg0.0 = extractelement <3 x float> %arg0, i32 0 %arg0.1 = extractelement <3 x float> %arg0, i32 1 %arg0.2 = extractelement <3 x float> %arg0, i32 2 @@ -778,15 +3567,49 @@ ret void } -; GCN-LABEL: {{^}}void_func_v3i32_wasted_reg: -; GCN: s_waitcnt -; GCN: ds_write_b32 v{{[0-9]+}}, v0 -; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1 -; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2 -; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3 -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 { +; CI-LABEL: void_func_v3i32_wasted_reg: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: ds_write_b32 v0, v1 +; CI-NEXT: ds_write_b32 v0, v2 +; CI-NEXT: ds_write_b32 v0, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v3i32_wasted_reg: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_write_b32 v0, v0 +; VI-NEXT: ds_write_b32 v0, v1 +; VI-NEXT: ds_write_b32 v0, v2 +; VI-NEXT: ds_write_b32 v0, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v3i32_wasted_reg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-NEXT: ds_write_b32 v0, v1 +; GFX9-NEXT: ds_write_b32 v0, v2 +; GFX9-NEXT: ds_write_b32 v0, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v3i32_wasted_reg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_store_b32 v0, v0 +; GFX11-NEXT: ds_store_b32 v0, v1 +; GFX11-NEXT: ds_store_b32 v0, v2 +; GFX11-NEXT: ds_store_b32 v0, v3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %arg0.0 = extractelement <3 x i32> %arg0, i32 0 %arg0.1 = extractelement <3 x i32> %arg0, i32 1 %arg0.2 = extractelement <3 x i32> %arg0, i32 2 @@ -798,18 +3621,409 @@ } ; Check there is no crash. -; GCN-LABEL: {{^}}void_func_v16i8: define void @void_func_v16i8(<16 x i8> %arg0) #0 { +; CIGFX89-LABEL: void_func_v16i8: +; CIGFX89: ; %bb.0: +; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: s_mov_b32 s7, 0xf000 +; CIGFX89-NEXT: s_mov_b32 s6, -1 +; CIGFX89-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v7, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v3, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v2, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; CIGFX89-NEXT: s_waitcnt vmcnt(0) +; CIGFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v16i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b8 v15, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v14, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v13, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v12, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v11, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v10, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v9, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v8, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v7, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v6, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v5, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v4, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v3, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v2, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v1, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <16 x i8> %arg0, ptr addrspace(1) undef ret void } ; Check there is no crash. -; GCN-LABEL: {{^}}void_func_v32i32_v16i8: define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { +; CI-LABEL: void_func_v32i32_v16i8: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:48 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:44 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: void_func_v32i32_v16i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ubyte v9, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ubyte v11, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_ubyte v4, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ubyte v5, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ubyte v6, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_v32i32_v16i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ubyte v17, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ubyte v18, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ubyte v19, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:60 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v12, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ubyte v13, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ubyte v14, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ubyte v15, off, s[0:3], s32 offset:28 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ubyte v9, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ubyte v10, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ubyte v11, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_ubyte v4, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ubyte v5, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ubyte v6, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v20, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v19, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v18, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v17, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v14, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v13, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v12, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v8, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v15, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v10, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v9, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v11, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v4, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v5, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_byte v6, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_v32i32_v16i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x10 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: scratch_load_u8 v32, off, s32 offset:64 +; GFX11-NEXT: scratch_load_u8 v33, off, s32 offset:60 +; GFX11-NEXT: scratch_load_u8 v34, off, s32 offset:56 +; GFX11-NEXT: scratch_load_u8 v35, off, s32 offset:52 +; GFX11-NEXT: scratch_load_u8 v36, off, s32 offset:48 +; GFX11-NEXT: scratch_load_u8 v37, off, s32 offset:44 +; GFX11-NEXT: scratch_load_u8 v38, off, s32 offset:40 +; GFX11-NEXT: scratch_load_u8 v39, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u8 v48, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u8 v49, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u8 v50, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u8 v51, off, s32 offset:20 +; GFX11-NEXT: scratch_load_u8 v52, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u8 v53, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u8 v54, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u8 v55, off, s32 offset:4 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[20:23], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(15) +; GFX11-NEXT: buffer_store_b8 v32, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(13) +; GFX11-NEXT: buffer_store_b8 v34, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: buffer_store_b8 v35, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: buffer_store_b8 v36, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: buffer_store_b8 v37, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: buffer_store_b8 v38, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: buffer_store_b8 v39, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: buffer_store_b8 v48, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: buffer_store_b8 v49, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: buffer_store_b8 v50, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: buffer_store_b8 v51, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: buffer_store_b8 v52, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: buffer_store_b8 v53, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: buffer_store_b8 v54, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b8 v55, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <16 x i8> %arg1, ptr addrspace(1) undef ret void } attributes #0 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -1,464 +1,1345 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s - -; GCN-LABEL: {{^}}i1_func_void: -; GCN: buffer_load_ubyte v0, off -; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_setpc_b64 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX789,CI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX789,GFX89,GFX8 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX789,GFX89,GFX9 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s + define i1 @i1_func_void() #0 { +; GFX789-LABEL: i1_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i1_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i1, ptr addrspace(1) undef ret i1 %val } ; FIXME: Missing and? -; GCN-LABEL: {{^}}i1_zeroext_func_void: -; GCN: buffer_load_ubyte v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define zeroext i1 @i1_zeroext_func_void() #0 { +; GFX789-LABEL: i1_zeroext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i1_zeroext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i1, ptr addrspace(1) undef ret i1 %val } -; GCN-LABEL: {{^}}i1_signext_func_void: -; GCN: buffer_load_ubyte v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1{{$}} -; GCN-NEXT: s_setpc_b64 define signext i1 @i1_signext_func_void() #0 { +; GFX789-LABEL: i1_signext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i1_signext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i1, ptr addrspace(1) undef ret i1 %val } -; GCN-LABEL: {{^}}i8_func_void: -; GCN: buffer_load_ubyte v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define i8 @i8_func_void() #0 { +; GFX789-LABEL: i8_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i8_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i8, ptr addrspace(1) undef ret i8 %val } -; GCN-LABEL: {{^}}i8_zeroext_func_void: -; GCN: buffer_load_ubyte v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define zeroext i8 @i8_zeroext_func_void() #0 { +; GFX789-LABEL: i8_zeroext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i8_zeroext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i8, ptr addrspace(1) undef ret i8 %val } -; GCN-LABEL: {{^}}i8_signext_func_void: -; GCN: buffer_load_sbyte v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define signext i8 @i8_signext_func_void() #0 { +; GFX789-LABEL: i8_signext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i8_signext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_i8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i8, ptr addrspace(1) undef ret i8 %val } -; GCN-LABEL: {{^}}i16_func_void: -; GCN: buffer_load_ushort v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define i16 @i16_func_void() #0 { +; GFX789-LABEL: i16_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i16, ptr addrspace(1) undef ret i16 %val } -; GCN-LABEL: {{^}}i16_zeroext_func_void: -; GCN: buffer_load_ushort v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define zeroext i16 @i16_zeroext_func_void() #0 { +; GFX789-LABEL: i16_zeroext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i16_zeroext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i16, ptr addrspace(1) undef ret i16 %val } -; GCN-LABEL: {{^}}i16_signext_func_void: -; GCN: buffer_load_sshort v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define signext i16 @i16_signext_func_void() #0 { +; GFX789-LABEL: i16_signext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_sshort v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i16_signext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_i16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i16, ptr addrspace(1) undef ret i16 %val } -; GCN-LABEL: {{^}}i32_func_void: -; GCN: buffer_load_dword v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define i32 @i32_func_void() #0 { +; GFX789-LABEL: i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i32, ptr addrspace(1) undef ret i32 %val } -; GCN-LABEL: {{^}}i48_func_void: -; GCN: buffer_load_dword v0, off -; GCN-NEXT: buffer_load_ushort v1, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define i48 @i48_func_void() #0 { +; GFX789-LABEL: i48_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX789-NEXT: buffer_load_ushort v1, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i48_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i48, ptr addrspace(1) undef, align 8 ret i48 %val } -; GCN-LABEL: {{^}}i48_zeroext_func_void: -; GCN: buffer_load_dword v0, off -; GCN-NEXT: buffer_load_ushort v1, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define zeroext i48 @i48_zeroext_func_void() #0 { +; GFX789-LABEL: i48_zeroext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX789-NEXT: buffer_load_ushort v1, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i48_zeroext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i48, ptr addrspace(1) undef, align 8 ret i48 %val } -; GCN-LABEL: {{^}}i48_signext_func_void: -; GCN: buffer_load_dword v0, off -; GCN-NEXT: buffer_load_sshort v1, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define signext i48 @i48_signext_func_void() #0 { +; GFX789-LABEL: i48_signext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX789-NEXT: buffer_load_sshort v1, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i48_signext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_i16 v1, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i48, ptr addrspace(1) undef, align 8 ret i48 %val } -; GCN-LABEL: {{^}}i63_func_void: -; GCN: s_waitcnt -; GCN-NEXT: s_setpc_b64 define i63 @i63_func_void(i63 %val) #0 { +; GFX789-LABEL: i63_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i63_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] ret i63 %val } -; GCN-LABEL: {{^}}i63_zeroext_func_void: -; GCN: s_waitcnt -; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GCN-NEXT: s_setpc_b64 define zeroext i63 @i63_zeroext_func_void(i63 %val) #0 { +; GFX789-LABEL: i63_zeroext_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i63_zeroext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] ret i63 %val } -; GCN-LABEL: {{^}}i63_signext_func_void: -; GCN: s_waitcnt -; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; CI-NEXT: v_ashr_i64 v[0:1], v[0:1], 1 - -; GFX89-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX89-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1] - -; GCN-NEXT: s_setpc_b64 define signext i63 @i63_signext_func_void(i63 %val) #0 { +; CI-LABEL: i63_signext_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; CI-NEXT: v_ashr_i64 v[0:1], v[0:1], 1 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: i63_signext_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX89-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1] +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i63_signext_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] ret i63 %val } -; GCN-LABEL: {{^}}i64_func_void: -; GCN: buffer_load_dwordx2 v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define i64 @i64_func_void() #0 { +; GFX789-LABEL: i64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i64, ptr addrspace(1) undef ret i64 %val } -; GCN-LABEL: {{^}}i65_func_void: -; GCN-DAG: buffer_load_dwordx2 v[0:1], off -; GCN-DAG: buffer_load_ubyte v2, off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define i65 @i65_func_void() #0 { +; GFX789-LABEL: i65_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: i65_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_u8 v2, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i65, ptr addrspace(1) undef ret i65 %val } -; GCN-LABEL: {{^}}f32_func_void: -; GCN: buffer_load_dword v0, off, s[4:7], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define float @f32_func_void() #0 { +; GFX789-LABEL: f32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: f32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load float, ptr addrspace(1) undef ret float %val } -; GCN-LABEL: {{^}}f64_func_void: -; GCN: buffer_load_dwordx2 v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define double @f64_func_void() #0 { +; GFX789-LABEL: f64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: f64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load double, ptr addrspace(1) undef ret double %val } -; GCN-LABEL: {{^}}v2f64_func_void: -; GCN: buffer_load_dwordx4 v[0:3], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <2 x double> @v2f64_func_void() #0 { +; GFX789-LABEL: v2f64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v2f64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <2 x double>, ptr addrspace(1) undef ret <2 x double> %val } -; GCN-LABEL: {{^}}v2i32_func_void: -; GCN: buffer_load_dwordx2 v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <2 x i32> @v2i32_func_void() #0 { +; GFX789-LABEL: v2i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v2i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <2 x i32>, ptr addrspace(1) undef ret <2 x i32> %val } -; GCN-LABEL: {{^}}v3i32_func_void: -; GCN: buffer_load_dwordx3 v[0:2], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <3 x i32> @v3i32_func_void() #0 { +; GFX789-LABEL: v3i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx3 v[0:2], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v3i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b96 v[0:2], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <3 x i32>, ptr addrspace(1) undef ret <3 x i32> %val } -; GCN-LABEL: {{^}}v4i32_func_void: -; GCN: buffer_load_dwordx4 v[0:3], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <4 x i32> @v4i32_func_void() #0 { +; GFX789-LABEL: v4i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v4i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <4 x i32>, ptr addrspace(1) undef ret <4 x i32> %val } -; GCN-LABEL: {{^}}v5i32_func_void: -; GCN-DAG: buffer_load_dword v4, off -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <5 x i32> @v5i32_func_void() #0 { +; GFX789-LABEL: v5i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dword v4, off, s[4:7], 0 glc +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v5i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b32 v4, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load volatile <5 x i32>, ptr addrspace(1) undef ret <5 x i32> %val } -; GCN-LABEL: {{^}}v8i32_func_void: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <8 x i32> @v8i32_func_void() #0 { +; GFX789-LABEL: v8i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v8i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <8 x i32>, ptr addrspace(1) %ptr ret <8 x i32> %val } -; GCN-LABEL: {{^}}v16i32_func_void: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <16 x i32> @v16i32_func_void() #0 { +; GFX789-LABEL: v16i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX789-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v16i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <16 x i32>, ptr addrspace(1) %ptr ret <16 x i32> %val } -; GCN-LABEL: {{^}}v32i32_func_void: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN-DAG: buffer_load_dwordx4 v[16:19], off -; GCN-DAG: buffer_load_dwordx4 v[20:23], off -; GCN-DAG: buffer_load_dwordx4 v[24:27], off -; GCN-DAG: buffer_load_dwordx4 v[28:31], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <32 x i32> @v32i32_func_void() #0 { +; GFX789-LABEL: v32i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX789-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX789-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; GFX789-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; GFX789-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; GFX789-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v32i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48 +; GFX11-NEXT: buffer_load_b128 v[16:19], off, s[0:3], 0 offset:64 +; GFX11-NEXT: buffer_load_b128 v[20:23], off, s[0:3], 0 offset:80 +; GFX11-NEXT: buffer_load_b128 v[24:27], off, s[0:3], 0 offset:96 +; GFX11-NEXT: buffer_load_b128 v[28:31], off, s[0:3], 0 offset:112 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <32 x i32>, ptr addrspace(1) %ptr ret <32 x i32> %val } -; GCN-LABEL: {{^}}v2i64_func_void: -; GCN: buffer_load_dwordx4 v[0:3], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <2 x i64> @v2i64_func_void() #0 { +; GFX789-LABEL: v2i64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v2i64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <2 x i64>, ptr addrspace(1) undef ret <2 x i64> %val } -; GCN-LABEL: {{^}}v3i64_func_void: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx2 v[4:5], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <3 x i64> @v3i64_func_void() #0 { +; GFX789-LABEL: v3i64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:16 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v3i64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[0:3], 0 offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <3 x i64>, ptr addrspace(1) %ptr ret <3 x i64> %val } -; GCN-LABEL: {{^}}v4i64_func_void: -; GCN: buffer_load_dwordx4 v[0:3], off -; GCN: buffer_load_dwordx4 v[4:7], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <4 x i64> @v4i64_func_void() #0 { +; GFX789-LABEL: v4i64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v4i64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <4 x i64>, ptr addrspace(1) %ptr ret <4 x i64> %val } -; GCN-LABEL: {{^}}v5i64_func_void: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx2 v[8:9], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <5 x i64> @v5i64_func_void() #0 { +; GFX789-LABEL: v5i64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: buffer_load_dwordx2 v[8:9], off, s[4:7], 0 offset:32 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v5i64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b64 v[8:9], off, s[0:3], 0 offset:32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <5 x i64>, ptr addrspace(1) %ptr ret <5 x i64> %val } -; GCN-LABEL: {{^}}v8i64_func_void: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <8 x i64> @v8i64_func_void() #0 { +; GFX789-LABEL: v8i64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX789-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v8i64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <8 x i64>, ptr addrspace(1) %ptr ret <8 x i64> %val } -; GCN-LABEL: {{^}}v16i64_func_void: -; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN-DAG: buffer_load_dwordx4 v[16:19], off -; GCN-DAG: buffer_load_dwordx4 v[20:23], off -; GCN-DAG: buffer_load_dwordx4 v[24:27], off -; GCN-DAG: buffer_load_dwordx4 v[28:31], off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define <16 x i64> @v16i64_func_void() #0 { +; GFX789-LABEL: v16i64_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX789-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; GFX789-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; GFX789-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64 +; GFX789-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80 +; GFX789-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96 +; GFX789-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v16i64_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48 +; GFX11-NEXT: buffer_load_b128 v[16:19], off, s[0:3], 0 offset:64 +; GFX11-NEXT: buffer_load_b128 v[20:23], off, s[0:3], 0 offset:80 +; GFX11-NEXT: buffer_load_b128 v[24:27], off, s[0:3], 0 offset:96 +; GFX11-NEXT: buffer_load_b128 v[28:31], off, s[0:3], 0 offset:112 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <16 x i64>, ptr addrspace(1) %ptr ret <16 x i64> %val } -; GCN-LABEL: {{^}}v2i16_func_void: -; GFX9: buffer_load_dword v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define <2 x i16> @v2i16_func_void() #0 { +; CI-LABEL: v2i16_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v2i16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v2i16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <2 x i16>, ptr addrspace(1) undef ret <2 x i16> %val } -; GCN-LABEL: {{^}}v3i16_func_void: -; GFX9: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define <3 x i16> @v3i16_func_void() #0 { +; CI-LABEL: v3i16_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; CI-NEXT: v_mov_b32_e32 v0, v2 +; CI-NEXT: v_mov_b32_e32 v2, v3 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v3i16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v3i16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <3 x i16>, ptr addrspace(1) undef ret <3 x i16> %val } -; GCN-LABEL: {{^}}v4i16_func_void: -; GFX9: buffer_load_dwordx2 v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define <4 x i16> @v4i16_func_void() #0 { +; CI-LABEL: v4i16_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; CI-NEXT: v_mov_b32_e32 v2, v1 +; CI-NEXT: v_mov_b32_e32 v1, v4 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v4i16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v4i16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <4 x i16>, ptr addrspace(1) undef ret <4 x i16> %val } -; GCN-LABEL: {{^}}v4f16_func_void: -; GFX9: buffer_load_dwordx2 v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define <4 x half> @v4f16_func_void() #0 { +; CI-LABEL: v4f16_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f32_f16_e32 v0, v3 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v4f16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v4f16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load <4 x half>, ptr addrspace(1) undef ret <4 x half> %val } ; FIXME: Mixing buffer and global ; FIXME: Should not scalarize -; GCN-LABEL: {{^}}v5i16_func_void: -; GFX9: buffer_load_dwordx4 v[0:3] -; GFX9-NEXT: s_waitcnt -; GFX9-NEXT: s_setpc_b64 define <5 x i16> @v5i16_func_void() #0 { +; CI-LABEL: v5i16_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; CI-NEXT: buffer_load_sshort v4, off, s[4:7], 0 offset:8 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; CI-NEXT: v_mov_b32_e32 v2, v1 +; CI-NEXT: v_mov_b32_e32 v1, v5 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v5i16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v5i16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <5 x i16>, ptr addrspace(1) %ptr ret <5 x i16> %val } -; GCN-LABEL: {{^}}v8i16_func_void: -; GFX9-DAG: buffer_load_dwordx4 v[0:3], off -; GFX9: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define <8 x i16> @v8i16_func_void() #0 { +; CI-LABEL: v8i16_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v11 +; CI-NEXT: v_mov_b32_e32 v0, v8 +; CI-NEXT: v_mov_b32_e32 v2, v9 +; CI-NEXT: v_mov_b32_e32 v4, v10 +; CI-NEXT: v_mov_b32_e32 v6, v11 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v8i16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v8i16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <8 x i16>, ptr addrspace(1) %ptr ret <8 x i16> %val } -; GCN-LABEL: {{^}}v16i16_func_void: -; GFX9: buffer_load_dwordx4 v[0:3], off -; GFX9: buffer_load_dwordx4 v[4:7], off -; GFX9: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define <16 x i16> @v16i16_func_void() #0 { +; CI-LABEL: v16i16_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dwordx4 v[22:25], off, s[4:7], 0 +; CI-NEXT: buffer_load_dwordx4 v[18:21], off, s[4:7], 0 offset:16 +; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v24 +; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v25 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v18 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 +; CI-NEXT: v_lshrrev_b32_e32 v15, 16, v21 +; CI-NEXT: v_mov_b32_e32 v0, v22 +; CI-NEXT: v_mov_b32_e32 v2, v23 +; CI-NEXT: v_mov_b32_e32 v4, v24 +; CI-NEXT: v_mov_b32_e32 v6, v25 +; CI-NEXT: v_mov_b32_e32 v8, v18 +; CI-NEXT: v_mov_b32_e32 v10, v19 +; CI-NEXT: v_mov_b32_e32 v12, v20 +; CI-NEXT: v_mov_b32_e32 v14, v21 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v16i16_func_void: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX89-NEXT: s_mov_b32 s7, 0xf000 +; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX89-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v16i16_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <16 x i16>, ptr addrspace(1) %ptr ret <16 x i16> %val } ; FIXME: Should pack -; GCN-LABEL: {{^}}v16i8_func_void: -; GCN-DAG: v12 -; GCN-DAG: v13 -; GCN-DAG: v14 -; GCN-DAG: v15 define <16 x i8> @v16i8_func_void() #0 { +; GFX789-LABEL: v16i8_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; GFX789-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX789-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; GFX789-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX789-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX789-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX789-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX789-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX789-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX789-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX789-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX789-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX789-NEXT: v_mov_b32_e32 v4, v1 +; GFX789-NEXT: v_mov_b32_e32 v8, v2 +; GFX789-NEXT: v_mov_b32_e32 v12, v3 +; GFX789-NEXT: v_mov_b32_e32 v1, v16 +; GFX789-NEXT: v_mov_b32_e32 v2, v17 +; GFX789-NEXT: v_mov_b32_e32 v3, v18 +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v16i8_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v16 +; GFX11-NEXT: v_mov_b32_e32 v8, v2 +; GFX11-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18 +; GFX11-NEXT: v_mov_b32_e32 v2, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <16 x i8>, ptr addrspace(1) %ptr ret <16 x i8> %val } ; FIXME: Should pack -; GCN-LABEL: {{^}}v4i8_func_void: -; GCN: buffer_load_dword v0 -; GCN-DAG: v_lshrrev_b32_e32 v1, 8, v0 -; GCN-DAG: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-DAG: v_lshrrev_b32_e32 v3, 24, v0 -; GCN: s_setpc_b64 define <4 x i8> @v4i8_func_void() #0 { +; GFX789-LABEL: v4i8_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: s_waitcnt lgkmcnt(0) +; GFX789-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX789-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX789-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v4i8_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <4 x i8>, ptr addrspace(1) %ptr ret <4 x i8> %val } -; GCN-LABEL: {{^}}struct_i8_i32_func_void: -; GCN-DAG: buffer_load_dword v1 -; GCN-DAG: buffer_load_ubyte v0 -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 define {i8, i32} @struct_i8_i32_func_void() #0 { +; GFX789-LABEL: struct_i8_i32_func_void: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX789-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: struct_i8_i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load { i8, i32 }, ptr addrspace(1) undef ret { i8, i32 } %val } -; GCN-LABEL: {{^}}void_func_sret_struct_i8_i32: -; GCN: buffer_load_ubyte [[VAL0:v[0-9]+]] -; GCN: buffer_load_dword [[VAL1:v[0-9]+]] -; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], 0 offen{{$}} -; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], 0 offen offset:4{{$}} define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %arg0) #0 { +; GFX789-LABEL: void_func_sret_struct_i8_i32: +; GFX789: ; %bb.0: +; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX789-NEXT: s_mov_b32 s7, 0xf000 +; GFX789-NEXT: s_mov_b32 s6, -1 +; GFX789-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 glc +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; GFX789-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_sret_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_load_u8 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_b32 v2, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b8 v0, v1, off +; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load volatile i8, ptr addrspace(1) undef %val1 = load volatile i32, ptr addrspace(1) undef %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0 @@ -471,140 +1352,939 @@ ; FIXME: Should be able to fold offsets in all of these pre-gfx9. Call ; lowering introduces an extra CopyToReg/CopyFromReg obscuring the ; AssertZext inserted. Not using it introduces the spills. - -; GCN-LABEL: {{^}}v33i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:4{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:8{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:12{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:16{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:20{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:24{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:28{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:32{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:36{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:40{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:44{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:48{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:52{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:56{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:60{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:64{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:68{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:72{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:76{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:80{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:84{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:88{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:92{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:96{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:100{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:104{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:108{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:112{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:116{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}} -; GFX9: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define <33 x i32> @v33i32_func_void() #0 { +; CI-LABEL: v33i32_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_add_i32_e32 v34, vcc, 0x80, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; CI-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 +; CI-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 +; CI-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 +; CI-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; CI-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; CI-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 +; CI-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 +; CI-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v33, vcc, 0x7c, v0 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v4, vcc, 0x78, v0 +; CI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; CI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; CI-NEXT: s_waitcnt vmcnt(11) +; CI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; CI-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; CI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; CI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; CI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; CI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; CI-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; CI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 +; CI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; CI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; CI-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; CI-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; CI-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; CI-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; CI-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; CI-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; CI-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v33i32_func_void: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_add_u32_e32 v34, vcc, 0x80, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; GFX8-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 +; GFX8-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 +; GFX8-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 +; GFX8-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; GFX8-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; GFX8-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 +; GFX8-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 0x7c, v0 +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x78, v0 +; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x74, v0 +; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x64, v0 +; GFX8-NEXT: s_waitcnt vmcnt(11) +; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 +; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; GFX8-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x58, v0 +; GFX8-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x50, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x48, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x44, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 60, v0 +; GFX8-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 52, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 48, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 44, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 40, v0 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 36, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; GFX8-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; GFX8-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; GFX8-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; GFX8-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; GFX8-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GFX8-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v33i32_func_void: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 +; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 +; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 +; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v33i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:112 +; GFX11-NEXT: buffer_load_b128 v[5:8], off, s[0:3], 0 offset:96 +; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80 +; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64 +; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48 +; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_add_i32 s1, s0, 0x70 +; GFX11-NEXT: s_add_i32 s2, s0, 0x60 +; GFX11-NEXT: s_add_i32 s3, s0, 0x50 +; GFX11-NEXT: s_add_i32 s4, s0, 64 +; GFX11-NEXT: s_add_i32 s5, s0, 48 +; GFX11-NEXT: s_add_i32 s6, s0, 32 +; GFX11-NEXT: s_add_i32 s7, s0, 16 +; GFX11-NEXT: s_add_i32 s8, s0, 0x80 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b32 off, v33, s8 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load <33 x i32>, ptr addrspace(1) %ptr ret <33 x i32> %val } -; GCN-LABEL: {{^}}struct_v32i32_i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:4{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:8{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:12{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:16{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:20{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:24{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:28{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:32{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:36{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:40{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:44{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:48{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:52{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:56{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:60{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:64{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:68{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:72{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:76{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:80{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:84{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:88{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:92{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:96{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:100{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:104{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:108{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:112{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:116{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}} -; GFX9: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { +; CI-LABEL: struct_v32i32_i32_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_add_i32_e32 v34, vcc, 0x80, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; CI-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 +; CI-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 +; CI-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 +; CI-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; CI-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; CI-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 +; CI-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 +; CI-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v33, vcc, 0x7c, v0 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v4, vcc, 0x78, v0 +; CI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; CI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; CI-NEXT: s_waitcnt vmcnt(11) +; CI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; CI-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; CI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; CI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; CI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; CI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; CI-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; CI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 +; CI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 +; CI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; CI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; CI-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; CI-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; CI-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; CI-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; CI-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; CI-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; CI-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: struct_v32i32_i32_func_void: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_add_u32_e32 v34, vcc, 0x80, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; GFX8-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 +; GFX8-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 +; GFX8-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 +; GFX8-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; GFX8-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; GFX8-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 +; GFX8-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 +; GFX8-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 0x7c, v0 +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x78, v0 +; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x74, v0 +; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x64, v0 +; GFX8-NEXT: s_waitcnt vmcnt(11) +; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x60, v0 +; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; GFX8-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x58, v0 +; GFX8-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x54, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x50, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 56, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x48, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x44, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 60, v0 +; GFX8-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 52, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 48, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 44, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 40, v0 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 36, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v0 +; GFX8-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 28, v0 +; GFX8-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 20, v0 +; GFX8-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 16, v0 +; GFX8-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 12, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 8, v0 +; GFX8-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GFX8-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: struct_v32i32_i32_func_void: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112 +; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96 +; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 offset:128 +; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64 +; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48 +; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32 +; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: struct_v32i32_i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:112 +; GFX11-NEXT: buffer_load_b128 v[5:8], off, s[0:3], 0 offset:96 +; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80 +; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64 +; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48 +; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32 +; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 +; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 offset:128 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_add_i32 s1, s0, 0x70 +; GFX11-NEXT: s_add_i32 s2, s0, 0x60 +; GFX11-NEXT: s_add_i32 s3, s0, 0x50 +; GFX11-NEXT: s_add_i32 s4, s0, 64 +; GFX11-NEXT: s_add_i32 s5, s0, 48 +; GFX11-NEXT: s_add_i32 s6, s0, 32 +; GFX11-NEXT: s_add_i32 s7, s0, 16 +; GFX11-NEXT: s_add_i32 s8, s0, 0x80 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: scratch_store_b128 off, v[29:32], s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b32 off, v33, s8 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr ret { <32 x i32>, i32 }%val } -; GCN-LABEL: {{^}}struct_i32_v32i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:132{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:136{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:140{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:144{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:148{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:152{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:156{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:160{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:164{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:168{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:172{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:176{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:180{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:184{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:188{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:192{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:196{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:200{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:204{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:208{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:212{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:216{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:220{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:224{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:228{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:232{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:236{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:240{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:244{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:248{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:252{{$}} -; GFX9: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { +; CI-LABEL: struct_i32_v32i32_func_void: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: buffer_load_dword v33, off, s[4:7], 0 +; CI-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240 +; CI-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224 +; CI-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208 +; CI-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192 +; CI-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176 +; CI-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160 +; CI-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144 +; CI-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 offset:128 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v33, vcc, 0xfc, v0 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v4, vcc, 0xf8, v0 +; CI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0xf4, v0 +; CI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 +; CI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0xec, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0xe4, v0 +; CI-NEXT: s_waitcnt vmcnt(11) +; CI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0xe0, v0 +; CI-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v2, vcc, 0xdc, v0 +; CI-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v3, vcc, 0xd8, v0 +; CI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0xd4, v0 +; CI-NEXT: v_add_i32_e32 v4, vcc, 0xd0, v0 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0xcc, v0 +; CI-NEXT: v_add_i32_e32 v6, vcc, 0xc8, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0xb8, v0 +; CI-NEXT: v_add_i32_e32 v7, vcc, 0xc4, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0 +; CI-NEXT: v_add_i32_e32 v3, vcc, 0xbc, v0 +; CI-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v4, vcc, 0xb4, v0 +; CI-NEXT: v_add_i32_e32 v8, vcc, 0xb0, v0 +; CI-NEXT: v_add_i32_e32 v9, vcc, 0xac, v0 +; CI-NEXT: v_add_i32_e32 v10, vcc, 0xa8, v0 +; CI-NEXT: v_add_i32_e32 v11, vcc, 0xa4, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v5, vcc, 0xa0, v0 +; CI-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x9c, v0 +; CI-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x98, v0 +; CI-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x94, v0 +; CI-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x90, v0 +; CI-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x8c, v0 +; CI-NEXT: s_waitcnt vmcnt(14) +; CI-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x88, v0 +; CI-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v1, vcc, 0x84, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 +; CI-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen +; CI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: struct_i32_v32i32_func_void: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: buffer_load_dword v33, off, s[4:7], 0 +; GFX8-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240 +; GFX8-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224 +; GFX8-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208 +; GFX8-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192 +; GFX8-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176 +; GFX8-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160 +; GFX8-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144 +; GFX8-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 offset:128 +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v33, vcc, 0xfc, v0 +; GFX8-NEXT: s_waitcnt vmcnt(8) +; GFX8-NEXT: buffer_store_dword v4, v33, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xf8, v0 +; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xf4, v0 +; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xf0, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xec, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xe8, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xe4, v0 +; GFX8-NEXT: s_waitcnt vmcnt(11) +; GFX8-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe0, v0 +; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xdc, v0 +; GFX8-NEXT: buffer_store_dword v6, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xd8, v0 +; GFX8-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xd4, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xd0, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xcc, v0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xc8, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v11, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xb8, v0 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xc4, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xc0, v0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xbc, v0 +; GFX8-NEXT: buffer_store_dword v9, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xb4, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xb0, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xac, v0 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xa8, v0 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xa4, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v16, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0xa0, v0 +; GFX8-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v14, v7, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v20, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v18, v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v24, v9, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v23, v10, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v22, v11, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x9c, v0 +; GFX8-NEXT: buffer_store_dword v28, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x98, v0 +; GFX8-NEXT: buffer_store_dword v27, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x94, v0 +; GFX8-NEXT: buffer_store_dword v26, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x90, v0 +; GFX8-NEXT: buffer_store_dword v25, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x8c, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: buffer_store_dword v32, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x88, v0 +; GFX8-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x84, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0 +; GFX8-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: struct_i32_v32i32_func_void: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240 +; GFX9-NEXT: buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224 +; GFX9-NEXT: buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208 +; GFX9-NEXT: buffer_load_dword v33, off, s[4:7], 0 +; GFX9-NEXT: buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192 +; GFX9-NEXT: buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176 +; GFX9-NEXT: buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160 +; GFX9-NEXT: buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144 +; GFX9-NEXT: buffer_load_dwordx4 v[29:32], off, s[4:7], 0 offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:252 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:248 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:244 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:240 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:236 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:232 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:228 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:224 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:204 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:200 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:196 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:192 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:188 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:184 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:180 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:176 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:172 +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:168 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:164 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:160 +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:156 +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:152 +; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:148 +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:144 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:140 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:136 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:132 +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: struct_i32_v32i32_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:240 +; GFX11-NEXT: buffer_load_b128 v[5:8], off, s[0:3], 0 offset:224 +; GFX11-NEXT: buffer_load_b128 v[9:12], off, s[0:3], 0 offset:208 +; GFX11-NEXT: buffer_load_b128 v[13:16], off, s[0:3], 0 offset:192 +; GFX11-NEXT: buffer_load_b128 v[17:20], off, s[0:3], 0 offset:176 +; GFX11-NEXT: buffer_load_b128 v[21:24], off, s[0:3], 0 offset:160 +; GFX11-NEXT: buffer_load_b128 v[25:28], off, s[0:3], 0 offset:144 +; GFX11-NEXT: buffer_load_b128 v[29:32], off, s[0:3], 0 offset:128 +; GFX11-NEXT: buffer_load_b32 v33, off, s[0:3], 0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_add_i32 s1, s0, 0xf0 +; GFX11-NEXT: s_add_i32 s2, s0, 0xe0 +; GFX11-NEXT: s_add_i32 s3, s0, 0xd0 +; GFX11-NEXT: s_add_i32 s4, s0, 0xc0 +; GFX11-NEXT: s_add_i32 s5, s0, 0xb0 +; GFX11-NEXT: s_add_i32 s6, s0, 0xa0 +; GFX11-NEXT: s_add_i32 s7, s0, 0x90 +; GFX11-NEXT: s_add_i32 s8, s0, 0x80 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s1 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: scratch_store_b128 off, v[5:8], s2 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: scratch_store_b128 off, v[9:12], s3 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: scratch_store_b128 off, v[13:16], s4 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: scratch_store_b128 off, v[17:20], s5 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: scratch_store_b128 off, v[21:24], s6 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: scratch_store_b128 off, v[25:28], s7 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: scratch_store_b128 off, v[29:32], s8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b32 off, v33, s0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef %val = load { i32, <32 x i32> }, ptr addrspace(1) %ptr ret { i32, <32 x i32> }%val } ; Make sure the last struct component is returned in v3, not v4. -; GCN-LABEL: {{^}}v3i32_struct_func_void_wasted_reg: -; GCN: ds_read_b32 v0, -; GCN: ds_read_b32 v1, -; GCN: ds_read_b32 v2, -; GCN: ds_read_b32 v3, define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 { +; CI-LABEL: v3i32_struct_func_void_wasted_reg: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_read_b32 v1, v0 +; CI-NEXT: ds_read_b32 v2, v0 +; CI-NEXT: ds_read_b32 v3, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v3i32_struct_func_void_wasted_reg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_read_b32 v1, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v3i32_struct_func_void_wasted_reg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v1, v0 +; GFX9-NEXT: ds_read_b32 v2, v0 +; GFX9-NEXT: ds_read_b32 v3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v3i32_struct_func_void_wasted_reg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_b32 v0, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_load_b32 v1, v0 +; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: ds_load_b32 v3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load0 = load volatile i32, ptr addrspace(3) undef %load1 = load volatile i32, ptr addrspace(3) undef %load2 = load volatile i32, ptr addrspace(3) undef @@ -618,12 +2298,53 @@ ret { <3 x i32>, i32 } %insert.4 } -; GCN-LABEL: {{^}}v3f32_struct_func_void_wasted_reg: -; GCN: ds_read_b32 v0, -; GCN: ds_read_b32 v1, -; GCN: ds_read_b32 v2, -; GCN: ds_read_b32 v3, define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 { +; CI-LABEL: v3f32_struct_func_void_wasted_reg: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_read_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: ds_read_b32 v1, v0 +; CI-NEXT: ds_read_b32 v2, v0 +; CI-NEXT: ds_read_b32 v3, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v3f32_struct_func_void_wasted_reg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_read_b32 v1, v0 +; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: ds_read_b32 v3, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v3f32_struct_func_void_wasted_reg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v1, v0 +; GFX9-NEXT: ds_read_b32 v2, v0 +; GFX9-NEXT: ds_read_b32 v3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v3f32_struct_func_void_wasted_reg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_b32 v0, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_load_b32 v1, v0 +; GFX11-NEXT: ds_load_b32 v2, v0 +; GFX11-NEXT: ds_load_b32 v3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load0 = load volatile float, ptr addrspace(3) undef %load1 = load volatile float, ptr addrspace(3) undef %load2 = load volatile float, ptr addrspace(3) undef @@ -637,14 +2358,54 @@ ret { <3 x float>, i32 } %insert.4 } -; GCN-LABEL: {{^}}void_func_sret_max_known_zero_bits: -; GCN: v_lshrrev_b32_e32 [[LSHR16:v[0-9]+]], 16, v0 -; GCN: ds_write_b32 {{v[0-9]+}}, [[LSHR16]] - -; GCN: v_mov_b32_e32 [[HIGH_BITS:v[0-9]+]], 0 -; GCN: ds_write_b32 {{v[0-9]+}}, [[HIGH_BITS]] -; GCN-NEXT: ds_write_b32 {{v[0-9]+}}, [[HIGH_BITS]] define void @void_func_sret_max_known_zero_bits(ptr addrspace(5) sret(i8) %arg0) #0 { +; CI-LABEL: void_func_sret_max_known_zero_bits: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: v_mov_b32_e32 v0, 0 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: ds_write_b32 v0, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: void_func_sret_max_known_zero_bits: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_write_b32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: ds_write_b32 v0, v0 +; GFX8-NEXT: ds_write_b32 v0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: void_func_sret_max_known_zero_bits: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_sret_max_known_zero_bits: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 17, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: ds_store_b32 v0, v1 +; GFX11-NEXT: ds_store_b32 v0, v0 +; GFX11-NEXT: ds_store_b32 v0, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %arg0.int = ptrtoint ptr addrspace(5) %arg0 to i32 %lshr0 = lshr i32 %arg0.int, 16 @@ -658,3 +2419,5 @@ } attributes #0 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI,CI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CIVI,VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX11 %s ; half args should be promoted to float for CI and lower. @@ -26,6 +27,17 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: load_f16_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm store half %arg, ptr addrspace(1) %out ret void } @@ -52,26 +64,49 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: load_v2f16_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm store <2 x half> %arg, ptr addrspace(1) %out ret void } define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 { -; GCN-LABEL: load_v3f16_arg: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_u32 s4, s0, 4 -; GCN-NEXT: s_addc_u32 s5, s1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v4, s3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v5, s2 -; GCN-NEXT: flat_store_short v[2:3], v4 -; GCN-NEXT: flat_store_dword v[0:1], v5 -; GCN-NEXT: s_endpgm +; CIVI-LABEL: load_v3f16_arg: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_add_u32 s4, s0, 4 +; CIVI-NEXT: s_addc_u32 s5, s1, 0 +; CIVI-NEXT: v_mov_b32_e32 v2, s4 +; CIVI-NEXT: v_mov_b32_e32 v4, s3 +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v3, s5 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: v_mov_b32_e32 v5, s2 +; CIVI-NEXT: flat_store_short v[2:3], v4 +; CIVI-NEXT: flat_store_dword v[0:1], v5 +; CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: load_v3f16_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:4 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm store <3 x half> %arg, ptr addrspace(1) %out ret void } @@ -79,16 +114,26 @@ ; FIXME: Why not one load? define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 { -; GCN-LABEL: load_v4f16_arg: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; GCN-NEXT: s_endpgm +; CIVI-LABEL: load_v4f16_arg: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v2, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: v_mov_b32_e32 v3, s3 +; CIVI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: load_v4f16_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm store <4 x half> %arg, ptr addrspace(1) %out ret void } @@ -121,6 +166,19 @@ ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: load_v8f16_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm store <8 x half> %arg, ptr addrspace(1) %out ret void } @@ -151,6 +209,20 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extload_v2f16_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %fpext = fpext <2 x half> %in to <2 x float> store <2 x float> %fpext, ptr addrspace(1) %out ret void @@ -178,6 +250,18 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extload_f16_to_f32_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ext = fpext half %arg to float store float %ext, ptr addrspace(1) %out ret void @@ -209,6 +293,20 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extload_v2f16_to_v2f32_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ext = fpext <2 x half> %arg to <2 x float> store <2 x float> %ext, ptr addrspace(1) %out ret void @@ -240,6 +338,19 @@ ; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extload_v3f16_to_v3f32_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s4, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ext = fpext <3 x half> %arg to <3 x float> store <3 x float> %ext, ptr addrspace(1) %out ret void @@ -275,6 +386,21 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extload_v4f16_to_v4f32_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s4, s3, 16 +; GFX11-NEXT: s_lshr_b32 s5, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s5 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ext = fpext <4 x half> %arg to <4 x float> store <4 x float> %ext, ptr addrspace(1) %out ret void @@ -336,6 +462,31 @@ ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extload_v8f16_to_v8f32_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s8, s7, 16 +; GFX11-NEXT: s_lshr_b32 s9, s6, 16 +; GFX11-NEXT: s_lshr_b32 s2, s5, 16 +; GFX11-NEXT: s_lshr_b32 s3, s4, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s7 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, s6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s8 +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, s9 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s5 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ext = fpext <8 x half> %arg to <8 x float> store <8 x float> %ext, ptr addrspace(1) %out ret void @@ -367,6 +518,20 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extload_f16_to_f64_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ext = fpext half %arg to double store double %ext, ptr addrspace(1) %out ret void @@ -404,6 +569,23 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extload_v2f16_to_v2f64_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ext = fpext <2 x half> %arg to <2 x double> store <2 x double> %ext, ptr addrspace(1) %out ret void @@ -451,6 +633,26 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extload_v3f16_to_v3f64_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s4, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v6 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ext = fpext <3 x half> %arg to <3 x double> store <3 x double> %ext, ptr addrspace(1) %out ret void @@ -506,6 +708,29 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extload_v4f16_to_v4f64_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-NEXT: s_lshr_b32 s4, s2, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s5 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v8, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v2 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ext = fpext <4 x half> %arg to <4 x double> store <4 x double> %ext, ptr addrspace(1) %out ret void @@ -605,97 +830,188 @@ ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: extload_v8f16_to_v8f64_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s9, s7, 16 +; GFX11-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-NEXT: s_lshr_b32 s3, s5, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s7 +; GFX11-NEXT: v_cvt_f32_f16_e32 v11, s9 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v10, s8 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s5 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, s3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v16, s2 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v6 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v11 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v2 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 +; GFX11-NEXT: v_mov_b32_e32 v16, 0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48 +; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 +; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ext = fpext <8 x half> %arg to <8 x double> store <8 x double> %ext, ptr addrspace(1) %out ret void } define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { -; GCN-LABEL: global_load_store_f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: flat_load_ushort v2, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_short v[0:1], v2 -; GCN-NEXT: s_endpgm +; CIVI-LABEL: global_load_store_f16: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: flat_load_ushort v2, v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: flat_store_short v[0:1], v2 +; CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: global_load_store_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in store half %val, ptr addrspace(1) %out ret void } define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { -; GCN-LABEL: global_load_store_v2f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: flat_load_dword v2, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_dword v[0:1], v2 -; GCN-NEXT: s_endpgm +; CIVI-LABEL: global_load_store_v2f16: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: flat_load_dword v2, v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: flat_store_dword v[0:1], v2 +; CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: global_load_store_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in store <2 x half> %val, ptr addrspace(1) %out ret void } define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { -; GCN-LABEL: global_load_store_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GCN-NEXT: s_endpgm +; CIVI-LABEL: global_load_store_v4f16: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v2, s2 +; CIVI-NEXT: v_mov_b32_e32 v3, s3 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: global_load_store_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <4 x half>, ptr addrspace(1) %in store <4 x half> %val, ptr addrspace(1) %out ret void } define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { -; GCN-LABEL: global_load_store_v8f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-NEXT: s_endpgm +; CIVI-LABEL: global_load_store_v8f16: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v4, s0 +; CIVI-NEXT: v_mov_b32_e32 v5, s1 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: global_load_store_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <8 x half>, ptr addrspace(1) %in store <8 x half> %val, ptr addrspace(1) %out ret void } define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { -; GCN-LABEL: global_extload_f16_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: flat_load_ushort v0, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: flat_store_dword v[0:1], v2 -; GCN-NEXT: s_endpgm +; CIVI-LABEL: global_extload_f16_to_f32: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: flat_load_ushort v0, v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: flat_store_dword v[0:1], v2 +; CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: global_extload_f16_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in %cvt = fpext half %val to float store float %cvt, ptr addrspace(1) %out @@ -733,6 +1049,21 @@ ; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_extload_v2f16_to_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in %cvt = fpext <2 x half> %val to <2 x float> store <2 x float> %cvt, ptr addrspace(1) %out @@ -772,6 +1103,22 @@ ; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_extload_v3f16_to_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v3, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <3 x half>, ptr addrspace(1) %in %cvt = fpext <3 x half> %val to <3 x float> store <3 x float> %cvt, ptr addrspace(1) %out @@ -814,6 +1161,24 @@ ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_extload_v4f16_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <4 x half>, ptr addrspace(1) %in %cvt = fpext <4 x half> %val to <4 x float> store <4 x float> %cvt, ptr addrspace(1) %out @@ -876,6 +1241,31 @@ ; VI-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; VI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_extload_v8f16_to_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v12, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v11, v5 +; GFX11-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v12, v[8:11], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v12, v[4:7], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <8 x half>, ptr addrspace(1) %in %cvt = fpext <8 x half> %val to <8 x float> store <8 x float> %cvt, ptr addrspace(1) %out @@ -990,6 +1380,48 @@ ; VI-NEXT: flat_store_dwordx4 v[24:25], v[16:19] ; VI-NEXT: flat_store_dwordx4 v[26:27], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_extload_v16f16_to_v16f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v20, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[0:3], v20, s[2:3] +; GFX11-NEXT: global_load_b128 v[4:7], v20, s[2:3] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v18, v7 +; GFX11-NEXT: v_cvt_f32_f16_e32 v16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v14, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v12, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v19, v7 +; GFX11-NEXT: v_cvt_f32_f16_e32 v17, v6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v11, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v15, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v5 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GFX11-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v20, v[16:19], s[0:1] offset:48 +; GFX11-NEXT: global_store_b128 v20, v[0:3], s[0:1] offset:32 +; GFX11-NEXT: global_store_b128 v20, v[12:15], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v20, v[8:11], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <16 x half>, ptr addrspace(1) %in %cvt = fpext <16 x half> %val to <16 x float> store <16 x float> %cvt, ptr addrspace(1) %out @@ -997,20 +1429,34 @@ } define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { -; GCN-LABEL: global_extload_f16_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: flat_load_ushort v0, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GCN-NEXT: s_endpgm +; CIVI-LABEL: global_extload_f16_to_f64: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: flat_load_ushort v0, v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v2, s0 +; CIVI-NEXT: v_mov_b32_e32 v3, s1 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CIVI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; CIVI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: global_extload_f16_to_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in %cvt = fpext half %val to double store double %cvt, ptr addrspace(1) %out @@ -1052,6 +1498,24 @@ ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_extload_v2f16_to_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v4, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in %cvt = fpext <2 x half> %val to <2 x double> store <2 x double> %cvt, ptr addrspace(1) %out @@ -1107,6 +1571,28 @@ ; VI-NEXT: flat_store_dwordx2 v[8:9], v[6:7] ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_extload_v3f16_to_v3f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <3 x half>, ptr addrspace(1) %in %cvt = fpext <3 x half> %val to <3 x double> store <3 x double> %cvt, ptr addrspace(1) %out @@ -1167,6 +1653,32 @@ ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_extload_v4f16_to_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <4 x half>, ptr addrspace(1) %in %cvt = fpext <4 x half> %val to <4 x double> store <4 x double> %cvt, ptr addrspace(1) %out @@ -1265,6 +1777,41 @@ ; VI-NEXT: flat_store_dwordx4 v[22:23], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_extload_v8f16_to_v8f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v16, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v16, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v17, v5 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v9 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v3 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v6 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48 +; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 +; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <8 x half>, ptr addrspace(1) %in %cvt = fpext <8 x half> %val to <8 x double> store <8 x double> %cvt, ptr addrspace(1) %out @@ -1452,6 +1999,68 @@ ; VI-NEXT: flat_store_dwordx4 v[20:21], v[8:11] ; VI-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_extload_v16f16_to_v16f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v32, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[0:3], v32, s[2:3] +; GFX11-NEXT: global_load_b128 v[4:7], v32, s[2:3] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v15, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_cvt_f32_f16_e32 v14, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v13, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v12, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v18, v4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v22, v5 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 +; GFX11-NEXT: v_cvt_f32_f16_e32 v10, v23 +; GFX11-NEXT: v_cvt_f32_f16_e32 v34, v11 +; GFX11-NEXT: v_cvt_f32_f16_e32 v11, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v22 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[30:31], v10 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[24:25], v18 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[26:27], v11 +; GFX11-NEXT: v_cvt_f32_f16_e32 v33, v9 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[20:21], v15 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[22:23], v7 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[16:17], v14 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[18:19], v6 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v8 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v13 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v3 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v34 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v33 +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:80 +; GFX11-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:64 +; GFX11-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:112 +; GFX11-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:96 +; GFX11-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48 +; GFX11-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32 +; GFX11-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v32, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <16 x half>, ptr addrspace(1) %in %cvt = fpext <16 x half> %val to <16 x double> store <16 x double> %cvt, ptr addrspace(1) %out @@ -1459,19 +2068,31 @@ } define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { -; GCN-LABEL: global_truncstore_f32_to_f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: flat_load_dword v0, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: flat_store_short v[0:1], v2 -; GCN-NEXT: s_endpgm +; CIVI-LABEL: global_truncstore_f32_to_f16: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: flat_load_dword v0, v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: v_cvt_f16_f32_e32 v2, v0 +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: flat_store_short v[0:1], v2 +; CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: global_truncstore_f32_to_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load float, ptr addrspace(1) %in %cvt = fptrunc float %val to half store half %cvt, ptr addrspace(1) %out @@ -1511,6 +2132,21 @@ ; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_truncstore_v2f32_to_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <2 x float>, ptr addrspace(1) %in %cvt = fptrunc <2 x float> %val to <2 x half> store <2 x half> %cvt, ptr addrspace(1) %out @@ -1562,6 +2198,24 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_truncstore_v3f32_to_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b96 v[0:2], v3, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] offset:4 +; GFX11-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <3 x float>, ptr addrspace(1) %in %cvt = fptrunc <3 x float> %val to <3 x half> store <3 x half> %cvt, ptr addrspace(1) %out @@ -1608,6 +2262,24 @@ ; VI-NEXT: v_or_b32_e32 v2, v5, v4 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_truncstore_v4f32_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pack_b32_f16 v1, v2, v3 +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v5 +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <4 x float>, ptr addrspace(1) %in %cvt = fptrunc <4 x float> %val to <4 x half> store <4 x half> %cvt, ptr addrspace(1) %out @@ -1680,6 +2352,33 @@ ; VI-NEXT: v_or_b32_e32 v2, v4, v5 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_truncstore_v8f32_to_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX11-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_pack_b32_f16 v3, v2, v3 +; GFX11-NEXT: v_pack_b32_f16 v2, v0, v1 +; GFX11-NEXT: v_pack_b32_f16 v1, v6, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_pack_b32_f16 v0, v4, v5 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <8 x float>, ptr addrspace(1) %in %cvt = fptrunc <8 x float> %val to <8 x half> store <8 x half> %cvt, ptr addrspace(1) %out @@ -1815,6 +2514,50 @@ ; VI-NEXT: v_or_b32_e32 v2, v12, v13 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: global_truncstore_v16f32_to_v16f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v16, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_load_b128 v[0:3], v16, s[2:3] offset:16 +; GFX11-NEXT: global_load_b128 v[4:7], v16, s[2:3] +; GFX11-NEXT: global_load_b128 v[8:11], v16, s[2:3] offset:48 +; GFX11-NEXT: global_load_b128 v[12:15], v16, s[2:3] offset:32 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX11-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX11-NEXT: v_cvt_f16_f32_e32 v17, v5 +; GFX11-NEXT: v_cvt_f16_f32_e32 v18, v4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v11 +; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v10 +; GFX11-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GFX11-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e32 v10, v15 +; GFX11-NEXT: v_cvt_f16_f32_e32 v11, v14 +; GFX11-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GFX11-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GFX11-NEXT: v_pack_b32_f16 v3, v2, v3 +; GFX11-NEXT: v_pack_b32_f16 v2, v0, v1 +; GFX11-NEXT: v_pack_b32_f16 v1, v6, v7 +; GFX11-NEXT: v_pack_b32_f16 v7, v5, v4 +; GFX11-NEXT: v_pack_b32_f16 v6, v8, v9 +; GFX11-NEXT: v_pack_b32_f16 v5, v11, v10 +; GFX11-NEXT: v_pack_b32_f16 v4, v12, v13 +; GFX11-NEXT: v_pack_b32_f16 v0, v18, v17 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load <16 x float>, ptr addrspace(1) %in %cvt = fptrunc <16 x float> %val to <16 x half> store <16 x half> %cvt, ptr addrspace(1) %out @@ -1851,6 +2594,20 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fadd_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_add_f16_e64 v1, s2, s3 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd half %a, %b store half %add, ptr addrspace(1) %out, align 4 ret void @@ -1894,6 +2651,16 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fadd_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v1, s2, s3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd <2 x half> %a, %b store <2 x half> %add, ptr addrspace(1) %out, align 8 ret void @@ -1955,6 +2722,19 @@ ; VI-NEXT: v_or_b32_e32 v0, v0, v3 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fadd_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v1, v1, v3 +; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %b_ptr = getelementptr <4 x half>, ptr addrspace(1) %in, i32 1 %a = load <4 x half>, ptr addrspace(1) %in, align 16 %b = load <4 x half>, ptr addrspace(1) %b_ptr, align 16 @@ -2063,24 +2843,50 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm +; +; GFX11-LABEL: fadd_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v3, s7, s11 +; GFX11-NEXT: v_pk_add_f16 v2, s6, s10 +; GFX11-NEXT: v_pk_add_f16 v1, s5, s9 +; GFX11-NEXT: v_pk_add_f16 v0, s4, s8 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = fadd <8 x half> %a, %b store <8 x half> %add, ptr addrspace(1) %out, align 32 ret void } define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { -; GCN-LABEL: test_bitcast_from_half: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: flat_load_ushort v2, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_short v[0:1], v2 -; GCN-NEXT: s_endpgm +; CIVI-LABEL: test_bitcast_from_half: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: flat_load_ushort v2, v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: flat_store_short v[0:1], v2 +; CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: test_bitcast_from_half: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in %val_int = bitcast half %val to i16 store i16 %val_int, ptr addrspace(1) %out @@ -2088,18 +2894,29 @@ } define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { -; GCN-LABEL: test_bitcast_to_half: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: flat_load_ushort v2, v[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_short v[0:1], v2 -; GCN-NEXT: s_endpgm +; CIVI-LABEL: test_bitcast_to_half: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: flat_load_ushort v2, v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: flat_store_short v[0:1], v2 +; CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: test_bitcast_to_half: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %in %val_fp = bitcast i16 %val to half store half %val_fp, ptr addrspace(1) %out @@ -2107,3 +2924,5 @@ } attributes #0 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -1,12 +1,24 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc -;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +; GFX68-LABEL: buffer_store: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX68-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc +; GFX68-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 glc +; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 slc +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1) @@ -14,34 +26,65 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +; GFX68-LABEL: buffer_store_immoffs: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_immoffs: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:42 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +; GFX68-LABEL: buffer_store_ofs: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_ofs: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) ret void } ; Ideally, the register allocator would avoid the wait here -; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +; VERDE-LABEL: buffer_store_wait: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; VERDE-NEXT: s_waitcnt expcnt(0) +; VERDE-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_wait: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_wait: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 offen +; GFX11-NEXT: buffer_load_b128 v[0:3], v5, s[0:3], 0 offen +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0) @@ -49,29 +92,52 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) { +; GFX68-LABEL: buffer_store_x1: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x1: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %offset) #0 { +; GFX68-LABEL: buffer_store_x2: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x2: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_and: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; GFX68-LABEL: buffer_store_x1_offen_merged_and: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX68-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x1_offen_merged_and: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -87,11 +153,22 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_or: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28 define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; GFX68-LABEL: buffer_store_x1_offen_merged_or: +; GFX68: ; %bb.0: +; GFX68-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; GFX68-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX68-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x1_offen_merged_or: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = shl i32 %inp, 6 %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -108,13 +185,22 @@ ret void } - -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; GFX68-LABEL: buffer_store_x1_offen_merged_glc_slc: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4 +; GFX68-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc +; GFX68-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x1_offen_merged_glc_slc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_store_b64 v[1:2], v0, s[0:3], 0 offen offset:4 +; GFX11-NEXT: buffer_store_b64 v[3:4], v0, s[0:3], 0 offen offset:12 glc +; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -130,10 +216,17 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_and: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { +; GFX68-LABEL: buffer_store_x2_offen_merged_and: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x2_offen_merged_and: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0) @@ -141,10 +234,19 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_or: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) { +; GFX68-LABEL: buffer_store_x2_offen_merged_or: +; GFX68: ; %bb.0: +; GFX68-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX68-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x2_offen_merged_or: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = shl i32 %inp, 4 %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 @@ -153,11 +255,20 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; GFX68-LABEL: buffer_store_x1_offset_merged: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; GFX68-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x1_offset_merged: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 +; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0) @@ -167,21 +278,38 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { +; GFX68-LABEL: buffer_store_x2_offset_merged: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x2_offset_merged: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_int: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -;CHECK: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc -;CHECK: buffer_store_dword v6, off, s[0:3], 0 slc define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) { +; GFX68-LABEL: buffer_store_int: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX68-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc +; GFX68-NEXT: buffer_store_dword v6, off, s[0:3], 0 slc +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_int: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 glc +; GFX11-NEXT: buffer_store_b32 v6, off, s[0:3], 0 slc +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 1) @@ -189,12 +317,19 @@ ret void } -;CHECK-LABEL: {{^}}raw_buffer_store_byte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) { +; GFX68-LABEL: raw_buffer_store_byte: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX68-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: raw_buffer_store_byte: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 @@ -202,12 +337,19 @@ ret void } -;CHECK-LABEL: {{^}}raw_buffer_store_short: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) { +; GFX68-LABEL: raw_buffer_store_short: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX68-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: raw_buffer_store_short: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 @@ -215,12 +357,17 @@ ret void } -;CHECK-LABEL: {{^}}raw_buffer_store_f16: -;CHECK-NEXT: %bb. -;CHECK-NOT: v0 -;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) { +; GFX68-LABEL: raw_buffer_store_f16: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: raw_buffer_store_f16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: %trunc = trunc i32 %v1 to i16 %cast = bitcast i16 %trunc to half @@ -228,59 +375,142 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_v2f16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %offset) { +; VERDE-LABEL: buffer_store_v2f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_v2f16: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_v2f16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v4f16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v4f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; VERDE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VERDE-NEXT: v_cvt_f16_f32_e32 v5, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v2, v1 +; VERDE-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v2 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_v4f16: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_v4f16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}raw_buffer_store_i16: -;CHECK-NEXT: %bb. -;CHECK-NOT: v0 -;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) { +; GFX68-LABEL: raw_buffer_store_i16: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: raw_buffer_store_i16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: %trunc = trunc i32 %v1 to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, <4 x i32> %rsrc, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v2i16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data, i32 %offset) { +; VERDE-LABEL: buffer_store_v2i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_v2i16: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_v2i16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v4i16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v4i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v2, v2, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v0, v1 +; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_v4i16: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_v4i16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}raw_buffer_store_x1_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; GFX68-LABEL: raw_buffer_store_x1_offset_merged: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; GFX68-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: raw_buffer_store_x1_offset_merged: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 +; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0) @@ -290,14 +520,28 @@ ret void } -;CHECK-LABEL: {{^}}raw_buffer_store_x1_offset_swizzled_not_merged: -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:12 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:16 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:28 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:32 define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged: +; GFX68: ; %bb.0: +; GFX68-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX68-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 +; GFX68-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:12 +; GFX68-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:16 +; GFX68-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:28 +; GFX68-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:32 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 offset:4 +; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 offset:8 +; GFX11-NEXT: buffer_store_b32 v2, off, s[0:3], 0 offset:12 +; GFX11-NEXT: buffer_store_b32 v3, off, s[0:3], 0 offset:16 +; GFX11-NEXT: buffer_store_b32 v4, off, s[0:3], 0 offset:28 +; GFX11-NEXT: buffer_store_b32 v5, off, s[0:3], 0 offset:32 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8) call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8) call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 8) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll @@ -1,12 +1,26 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_store_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_store_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s +; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s + define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +; GFX68-LABEL: buffer_store: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_mov_b32_e32 v12, 0 +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v12, s[0:3], 0 idxen +; GFX68-NEXT: buffer_store_dwordx4 v[4:7], v12, s[0:3], 0 idxen glc +; GFX68-NEXT: buffer_store_dwordx4 v[8:11], v12, s[0:3], 0 idxen slc +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v12, 0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_store_b128 v[0:3], v12, s[0:3], 0 idxen +; GFX11-NEXT: buffer_store_b128 v[4:7], v12, s[0:3], 0 idxen glc +; GFX11-NEXT: buffer_store_b128 v[8:11], v12, s[0:3], 0 idxen slc +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1) @@ -14,62 +28,123 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +; GFX68-LABEL: buffer_store_immoffs: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_mov_b32_e32 v4, 0 +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen offset:42 +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_immoffs: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen offset:42 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_idx: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { +; GFX68-LABEL: buffer_store_idx: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_idx: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +; GFX68-LABEL: buffer_store_ofs: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: s_mov_b32 s4, 0 +; GFX68-NEXT: v_mov_b32_e32 v5, v4 +; GFX68-NEXT: v_mov_b32_e32 v4, s4 +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_ofs: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, s4 +; GFX11-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { +; GFX68-LABEL: buffer_store_both: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_both: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both_reversed: -;CHECK: v_mov_b32_e32 v6, v4 -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { +; GFX68-LABEL: buffer_store_both_reversed: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_mov_b32_e32 v6, v4 +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_both_reversed: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: buffer_store_b128 v[0:3], v[5:6], s[0:3], 0 idxen offen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0) ret void } ; Ideally, the register allocator would avoid the wait here -; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +; VERDE-LABEL: buffer_store_wait: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; VERDE-NEXT: s_waitcnt expcnt(0) +; VERDE-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: buffer_store_wait: +; GFX8: ; %bb.0: ; %main_body +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; GFX8-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_wait: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen +; GFX11-NEXT: buffer_load_b128 v[0:3], v5, s[0:3], 0 idxen +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0) @@ -77,30 +152,56 @@ ret void } -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +; GFX68-LABEL: buffer_store_x1: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x1: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 { +; GFX68-LABEL: buffer_store_x2: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_x2: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_int: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_store_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_store_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) { +; GFX68-LABEL: buffer_store_int: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_mov_b32_e32 v7, 0 +; GFX68-NEXT: buffer_store_dwordx4 v[0:3], v7, s[0:3], 0 idxen +; GFX68-NEXT: buffer_store_dwordx2 v[4:5], v7, s[0:3], 0 idxen glc +; GFX68-NEXT: buffer_store_dword v6, v7, s[0:3], 0 idxen slc +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: buffer_store_int: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v7, 0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: buffer_store_b128 v[0:3], v7, s[0:3], 0 idxen +; GFX11-NEXT: buffer_store_b64 v[4:5], v7, s[0:3], 0 idxen glc +; GFX11-NEXT: buffer_store_b32 v6, v7, s[0:3], 0 idxen slc +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1) @@ -108,12 +209,19 @@ ret void } -;CHECK-LABEL: {{^}}struct_buffer_store_byte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) { +; GFX68-LABEL: struct_buffer_store_byte: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX68-NEXT: buffer_store_byte v0, v1, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: struct_buffer_store_byte: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b8 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 @@ -121,39 +229,89 @@ ret void } -;CHECK-LABEL: {{^}}struct_buffer_store_f16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_f16_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1, i32 %index) { +; GFX68-LABEL: struct_buffer_store_f16: +; GFX68: ; %bb.0: +; GFX68-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX68-NEXT: buffer_store_short v0, v1, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: struct_buffer_store_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %v2 = fptrunc float %v1 to half call void @llvm.amdgcn.struct.buffer.store.f16(half %v2, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_buffer_store_v2f16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %v1, i32 %index) { +; VERDE-LABEL: struct_buffer_store_v2f16: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: struct_buffer_store_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: struct_buffer_store_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_buffer_store_v4f16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %v1, i32 %index) { +; VERDE-LABEL: struct_buffer_store_v4f16: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; VERDE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VERDE-NEXT: v_cvt_f16_f32_e32 v5, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v2, v1 +; VERDE-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v2 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 idxen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: struct_buffer_store_v4f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: struct_buffer_store_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_buffer_store_i16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1, i32 %index) { +; GFX68-LABEL: struct_buffer_store_i16: +; GFX68: ; %bb.0: ; %main_body +; GFX68-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX68-NEXT: buffer_store_short v0, v1, s[0:3], 0 idxen +; GFX68-NEXT: s_endpgm +; +; GFX11-LABEL: struct_buffer_store_i16: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 @@ -161,18 +319,51 @@ ret void } -;CHECK-LABEL: {{^}}struct_buffer_store_vif16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16> %v1, i32 %index) { +; VERDE-LABEL: struct_buffer_store_vif16: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: struct_buffer_store_vif16: +; GFX8: ; %bb.0: +; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: struct_buffer_store_vif16: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_buffer_store_v4i16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %v1, i32 %index) { +; VERDE-LABEL: struct_buffer_store_v4i16: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v2, v2, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v0, v1 +; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 idxen +; VERDE-NEXT: s_endpgm +; +; GFX8-LABEL: struct_buffer_store_v4i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; GFX8-NEXT: s_endpgm +; +; GFX11-LABEL: struct_buffer_store_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } @@ -192,6 +383,5 @@ declare void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) #0 - attributes #0 = { nounwind } attributes #1 = { nounwind readonly }