diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s define <2 x i16> @v_add_v2i16(<2 x i16> %a, <2 x i16> %b) { ; GFX9-LABEL: v_add_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add_shl.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s ; =================================================================================== ; V_ADD_LSHL_U32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 
-mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s ; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps i32 @s_andn2_i32(i32 inreg %src0, i32 inreg %src1) { ; GCN-LABEL: s_andn2_i32: @@ -13,6 +14,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_andn2_b32 s0, s2, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s0, s2, s3 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %and = and i32 %src0, %not.src1 ret i32 %and @@ -28,6 +34,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_andn2_b32 s0, s2, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_i32_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s0, s2, s3 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %and = and i32 %not.src1, %src0 ret i32 %and @@ -45,6 +56,12 @@ ; GFX10-NEXT: s_andn2_b32 s0, s2, s3 ; GFX10-NEXT: s_not_b32 s1, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_i32_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s0, s2, s3 +; GFX11-NEXT: s_not_b32 s1, s3 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %and = and i32 %src0, %not.src1 %insert.0 = insertvalue { i32, i32 } undef, i32 %and, 0 @@ -64,6 +81,12 @@ ; GFX10-NEXT: s_andn2_b32 s0, s2, s4 ; 
GFX10-NEXT: s_andn2_b32 s1, s3, s4 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_i32_multi_foldable_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s0, s2, s4 +; GFX11-NEXT: s_and_not1_b32 s1, s3, s4 +; GFX11-NEXT: ; return to shader part epilog %not.src2 = xor i32 %src2, -1 %and0 = and i32 %src0, %not.src2 %and1 = and i32 %src1, %not.src2 @@ -80,13 +103,13 @@ ; GCN-NEXT: v_and_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_andn2_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_andn2_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i32 %src1, -1 %and = and i32 %src0, %not.src1 ret i32 %and @@ -99,11 +122,11 @@ ; GCN-NEXT: v_and_b32_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: v_andn2_i32_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: v_andn2_i32_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %and = and i32 %src0, %not.src1 %cast = bitcast i32 %and to float @@ -117,11 +140,11 @@ ; GCN-NEXT: v_and_b32_e32 v0, s0, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: v_andn2_i32_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_not_b32 s0, s2 -; GFX10-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: v_andn2_i32_vs: +; GFX10PLUS: ; %bb.0: +; 
GFX10PLUS-NEXT: s_not_b32 s0, s2 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %and = and i32 %src0, %not.src1 %cast = bitcast i32 %and to float @@ -138,6 +161,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[2:3], s[4:5] +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %and = and i64 %src0, %not.src1 ret i64 %and @@ -153,6 +181,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_i64_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[2:3], s[4:5] +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %and = and i64 %not.src1, %src0 ret i64 %and @@ -170,6 +203,12 @@ ; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[6:7] ; GFX10-NEXT: s_andn2_b64 s[2:3], s[4:5], s[6:7] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_i64_multi_foldable_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[2:3], s[6:7] +; GFX11-NEXT: s_and_not1_b64 s[2:3], s[4:5], s[6:7] +; GFX11-NEXT: ; return to shader part epilog %not.src2 = xor i64 %src2, -1 %and0 = and i64 %src0, %not.src2 %and1 = and i64 %src1, %not.src2 @@ -192,6 +231,12 @@ ; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5] ; GFX10-NEXT: s_not_b64 s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_i64_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[2:3], s[4:5] +; GFX11-NEXT: s_not_b64 s[2:3], s[4:5] +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %and = and i64 %src0, %not.src1 %insert.0 = insertvalue { i64, i64 } undef, i64 %and, 0 @@ -209,15 +254,15 @@ ; GCN-NEXT: v_and_b32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX10-LABEL: v_andn2_i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_andn2_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10PLUS-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX10PLUS-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i64 %src1, -1 %and = and i64 %src0, %not.src1 ret i64 %and @@ -232,13 +277,13 @@ ; GCN-NEXT: v_and_b32_e32 v1, s3, v1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: v_andn2_i64_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: v_andn2_i64_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10PLUS-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %and = and i64 %src0, %not.src1 %cast = bitcast i64 %and to <2 x float> @@ -253,12 +298,12 @@ ; GCN-NEXT: v_and_b32_e32 v1, s1, v1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: v_andn2_i64_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_not_b64 s[0:1], s[2:3] -; GFX10-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s1, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: v_andn2_i64_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_not_b64 s[0:1], s[2:3] +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 
s0, v0 +; GFX10PLUS-NEXT: v_and_b32_e32 v1, s1, v1 +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %and = and i64 %src0, %not.src1 %cast = bitcast i64 %and to <2 x float> @@ -275,6 +320,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[2:3], s[4:5] +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i32> %src1, %and = and <2 x i32> %src0, %not.src1 ret <2 x i32> %and @@ -290,6 +340,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_andn2_b64 s[0:1], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_v2i32_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[2:3], s[4:5] +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i32> %src1, %and = and <2 x i32> %not.src1, %src0 ret <2 x i32> %and @@ -305,6 +360,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_andn2_b32 s0, s2, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s0, s2, s3 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 ret i16 %and @@ -320,6 +380,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_andn2_b32 s0, s2, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_i16_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s0, s2, s3 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %not.src1, %src0 ret i16 %and @@ -337,6 +402,12 @@ ; GFX10-NEXT: s_andn2_b32 s0, s2, s3 ; GFX10-NEXT: s_xor_b32 s1, s3, -1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_i16_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s0, s2, s3 +; GFX11-NEXT: s_xor_b32 s1, s3, -1 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and 
i16 %src0, %not.src1 %insert.0 = insertvalue { i16, i16 } undef, i16 %and, 0 @@ -356,6 +427,12 @@ ; GFX10-NEXT: s_andn2_b32 s0, s2, s4 ; GFX10-NEXT: s_andn2_b32 s1, s3, s4 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_i16_multi_foldable_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s0, s2, s4 +; GFX11-NEXT: s_and_not1_b32 s1, s3, s4 +; GFX11-NEXT: ; return to shader part epilog %not.src2 = xor i16 %src2, -1 %and0 = and i16 %src0, %not.src2 %and1 = and i16 %src1, %not.src2 @@ -372,13 +449,13 @@ ; GCN-NEXT: v_and_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_andn2_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_andn2_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 ret i16 %and @@ -392,12 +469,12 @@ ; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: v_andn2_i16_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: v_andn2_i16_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 %zext = zext i16 %and to i32 @@ -413,12 +490,12 @@ ; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: 
v_andn2_i16_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_xor_b32 s0, s2, -1 -; GFX10-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: v_andn2_i16_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_xor_b32 s0, s2, -1 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %and = and i16 %src0, %not.src1 %zext = zext i16 %and to i32 @@ -448,6 +525,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_andn2_b32 s0, s2, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s0, s2, s3 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %and = and <2 x i16> %src0, %not.src1 %cast = bitcast <2 x i16> %and to i32 @@ -476,6 +558,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_andn2_b32 s0, s2, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_v2i16_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s0, s2, s3 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %and = and <2 x i16> %not.src1, %src0 %cast = bitcast <2 x i16> %and to i32 @@ -506,6 +593,12 @@ ; GFX10-NEXT: s_andn2_b32 s0, s2, s3 ; GFX10-NEXT: s_xor_b32 s1, s3, -1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_v2i16_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s0, s2, s3 +; GFX11-NEXT: s_xor_b32 s1, s3, -1 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %and = and <2 x i16> %src0, %not.src1 @@ -544,6 +637,12 @@ ; GFX10-NEXT: s_andn2_b32 s0, s2, s4 ; GFX10-NEXT: s_andn2_b32 s1, s3, s4 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_andn2_v2i16_multi_foldable_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s0, s2, s4 +; GFX11-NEXT: s_and_not1_b32 s1, s3, s4 +; GFX11-NEXT: ; return to shader part 
epilog %not.src2 = xor <2 x i16> %src2, %and0 = and <2 x i16> %src0, %not.src2 %and1 = and <2 x i16> %src1, %not.src2 @@ -577,13 +676,13 @@ ; GFX9-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_andn2_v2i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_andn2_v2i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor <2 x i16> %src1, %and = and <2 x i16> %src0, %not.src1 ret <2 x i16> %and @@ -650,13 +749,13 @@ ; GFX9-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_andn2_v4i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, -1 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] -; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_andn2_v4i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, -1 +; GFX10PLUS-NEXT: s_mov_b32 s1, s0 +; GFX10PLUS-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor <4 x i16> %src1, %and = and <4 x i16> %src0, %not.src1 %cast = bitcast <4 x i16> %and to i64 @@ -692,13 +791,13 @@ ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_andn2_v4i16_commute: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, -1 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] -; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: ; return to shader part epilog +; 
GFX10PLUS-LABEL: s_andn2_v4i16_commute: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, -1 +; GFX10PLUS-NEXT: s_mov_b32 s1, s0 +; GFX10PLUS-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor <4 x i16> %src1, %and = and <4 x i16> %not.src1, %src0 %cast = bitcast <4 x i16> %and to i64 @@ -736,15 +835,15 @@ ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_andn2_v4i16_multi_use: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, -1 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] -; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_andn2_v4i16_multi_use: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, -1 +; GFX10PLUS-NEXT: s_mov_b32 s1, s0 +; GFX10PLUS-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] +; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor <4 x i16> %src1, %and = and <4 x i16> %src0, %not.src1 @@ -792,14 +891,14 @@ ; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_andn2_v4i16_multi_foldable_use: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, -1 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] -; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[6:7] -; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_andn2_v4i16_multi_foldable_use: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, -1 +; GFX10PLUS-NEXT: s_mov_b32 s1, s0 +; GFX10PLUS-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] +; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[2:3], s[6:7] +; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[4:5], 
s[6:7] +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src2 = xor <4 x i16> %src2, %and0 = and <4 x i16> %src0, %not.src2 %and1 = and <4 x i16> %src1, %not.src2 @@ -844,15 +943,15 @@ ; GFX9-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_andn2_v4i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_andn2_v4i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10PLUS-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX10PLUS-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor <4 x i16> %src1, %and = and <4 x i16> %src0, %not.src1 ret <4 x i16> %and diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -2,7 +2,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 
@v_ashr_i8(i8 %value, i8 %amount) { ; GFX6-LABEL: v_ashr_i8: @@ -26,14 +27,14 @@ ; GFX9-NEXT: v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_i8: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX10-NEXT: v_ashrrev_i16 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_i8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX10PLUS-NEXT: v_ashrrev_i16 v0, v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i8 %value, %amount ret i8 %result } @@ -60,13 +61,13 @@ ; GFX9-NEXT: v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_i8_7: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX10-NEXT: v_ashrrev_i16 v0, 7, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_i8_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 7, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i8 %value, 7 ret i8 %result } @@ -92,12 +93,12 @@ ; GFX9-NEXT: s_ashr_i32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_i8: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i8 s0, s0 -; GFX10-NEXT: s_sext_i32_i8 s1, s1 -; GFX10-NEXT: s_ashr_i32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: 
s_ashr_i8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 +; GFX10PLUS-NEXT: s_sext_i32_i8 s1, s1 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i8 %value, %amount ret i8 %result } @@ -109,11 +110,11 @@ ; GCN-NEXT: s_ashr_i32 s0, s0, 7 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_i8_7: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i8 s0, s0 -; GFX10-NEXT: s_ashr_i32 s0, s0, 7 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_i8_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_sext_i32_i8 s0, s0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 7 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i8 %value, 7 ret i8 %result } @@ -128,14 +129,14 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_i24: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_i24: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i24 %value, %amount ret i24 %result } @@ -148,13 +149,13 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v0, 7, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_i24_7: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 7, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_i24_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 24 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 7, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i24 %value, 7 ret i24 %result } @@ -166,11 +167,11 @@ ; GCN-NEXT: s_ashr_i32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_i24: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_i32 s0, s0, 0x180000 -; GFX10-NEXT: s_ashr_i32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_i24: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x180000 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i24 %value, %amount ret i24 %result } @@ -182,11 +183,11 @@ ; GCN-NEXT: s_ashr_i32 s0, s0, 7 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_i24_7: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_i32 s0, s0, 0x180000 -; GFX10-NEXT: s_ashr_i32 s0, s0, 7 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_i24_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_i32 s0, s0, 0x180000 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 7 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i24 %value, 7 ret i24 %result } @@ -198,12 +199,12 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i32 %value, %amount ret i32 %result } @@ -215,12 +216,12 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] 
; -; GFX10-LABEL: v_ashr_i32_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_i32_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i32 %value, 31 ret i32 %result } @@ -231,10 +232,10 @@ ; GCN-NEXT: s_ashr_i32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_ashr_i32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i32 %value, %amount ret i32 %result } @@ -245,10 +246,10 @@ ; GCN-NEXT: s_ashr_i32 s0, s0, 31 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_i32_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_ashr_i32 s0, s0, 31 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_i32_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 31 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i32 %value, 31 ret i32 %result } @@ -269,10 +270,10 @@ ; GFX9-NEXT: v_ashrrev_i32_e64 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: ashr_i32_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_ashrrev_i32_e64 v0, v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: ashr_i32_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_ashrrev_i32_e64 v0, v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i32 %value, %amount %cast = bitcast i32 %result to float ret float %cast @@ -284,10 +285,10 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v0, s0, v0 ; GCN-NEXT: ; return to shader part epilog ; -; 
GFX10-LABEL: ashr_i32_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_ashrrev_i32_e32 v0, s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: ashr_i32_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, s0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i32 %value, %amount %cast = bitcast i32 %result to float ret float %cast @@ -301,13 +302,13 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v1, v3, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, v2, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, v3, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_v2i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, v2, v0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, v3, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i32> %value, %amount ret <2 x i32> %result } @@ -320,13 +321,13 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_v2i32_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_v2i32_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i32> %value, ret <2 x i32> %result } @@ -338,11 +339,11 @@ ; GCN-NEXT: s_ashr_i32 s1, s1, s3 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_ashr_i32 s0, s0, s2 -; GFX10-NEXT: s_ashr_i32 s1, s1, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_v2i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s2 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s1, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr <2 x i32> %value, %amount ret <2 x i32> %result } @@ -356,14 +357,14 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v2, v5, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_v3i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, v3, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, v4, v1 -; GFX10-NEXT: v_ashrrev_i32_e32 v2, v5, v2 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_v3i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, v3, v0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, v4, v1 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, v5, v2 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr <3 x i32> %value, %amount ret <3 x i32> %result } @@ -376,12 +377,12 @@ ; GCN-NEXT: s_ashr_i32 s2, s2, s5 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_v3i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_ashr_i32 s0, s0, s3 -; GFX10-NEXT: s_ashr_i32 s1, s1, s4 -; GFX10-NEXT: s_ashr_i32 s2, s2, s5 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_v3i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s3 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s1, s4 +; GFX10PLUS-NEXT: s_ashr_i32 s2, s2, s5 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr <3 x i32> %value, %amount ret <3 x i32> %result } @@ -396,15 +397,15 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v3, v7, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_v4i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, v4, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, v5, v1 -; GFX10-NEXT: v_ashrrev_i32_e32 v2, v6, v2 -; GFX10-NEXT: v_ashrrev_i32_e32 v3, v7, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_v4i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, v4, v0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, v5, v1 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, v6, v2 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v3, v7, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr <4 x i32> %value, %amount ret <4 x i32> %result } @@ -418,13 +419,13 @@ ; GCN-NEXT: s_ashr_i32 s3, s3, s7 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_v4i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_ashr_i32 s0, s0, s4 -; GFX10-NEXT: s_ashr_i32 s1, s1, s5 -; GFX10-NEXT: s_ashr_i32 s2, s2, s6 -; GFX10-NEXT: s_ashr_i32 s3, s3, s7 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_v4i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s4 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s1, s5 +; GFX10PLUS-NEXT: s_ashr_i32 s2, s2, s6 +; GFX10PLUS-NEXT: s_ashr_i32 s3, s3, s7 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr <4 x i32> %value, %amount ret <4 x i32> %result } @@ -440,16 +441,16 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v4, v9, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_v5i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, v5, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, v6, v1 -; GFX10-NEXT: v_ashrrev_i32_e32 v2, v7, v2 -; GFX10-NEXT: v_ashrrev_i32_e32 v3, v8, v3 -; GFX10-NEXT: v_ashrrev_i32_e32 v4, v9, v4 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_v5i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, v5, v0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, v6, v1 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, v7, v2 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v3, v8, v3 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v4, v9, v4 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr <5 x i32> %value, %amount ret <5 x i32> %result } @@ -464,14 +465,14 @@ ; GCN-NEXT: s_ashr_i32 s4, s4, s9 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_v5i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_ashr_i32 s0, s0, s5 -; GFX10-NEXT: s_ashr_i32 s1, s1, s6 -; GFX10-NEXT: s_ashr_i32 s2, s2, s7 -; GFX10-NEXT: s_ashr_i32 s3, s3, s8 -; GFX10-NEXT: s_ashr_i32 s4, s4, s9 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_v5i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s5 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s1, s6 +; GFX10PLUS-NEXT: s_ashr_i32 s2, s2, s7 +; GFX10PLUS-NEXT: s_ashr_i32 s3, s3, s8 +; GFX10PLUS-NEXT: s_ashr_i32 s4, s4, s9 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr <5 x i32> %value, %amount ret <5 x i32> %result } @@ -523,6 +524,30 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ashrrev_i32_e32 v15, v31, v15 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_ashr_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: v_ashrrev_i32_e32 v0, v16, v0 +; GFX11-NEXT: v_ashrrev_i32_e32 v1, v17, v1 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, v18, v2 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, v19, v3 +; GFX11-NEXT: v_ashrrev_i32_e32 v4, v20, v4 +; GFX11-NEXT: v_ashrrev_i32_e32 v5, v21, v5 +; GFX11-NEXT: v_ashrrev_i32_e32 v6, v22, v6 +; GFX11-NEXT: v_ashrrev_i32_e32 v7, v23, v7 +; GFX11-NEXT: v_ashrrev_i32_e32 v8, v24, v8 +; GFX11-NEXT: v_ashrrev_i32_e32 v9, v25, v9 +; GFX11-NEXT: v_ashrrev_i32_e32 v10, v26, v10 +; GFX11-NEXT: 
v_ashrrev_i32_e32 v11, v27, v11 +; GFX11-NEXT: v_ashrrev_i32_e32 v12, v28, v12 +; GFX11-NEXT: v_ashrrev_i32_e32 v13, v29, v13 +; GFX11-NEXT: v_ashrrev_i32_e32 v14, v30, v14 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v15, v31, v15 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = ashr <16 x i32> %value, %amount ret <16 x i32> %result } @@ -548,25 +573,25 @@ ; GCN-NEXT: s_ashr_i32 s15, s15, s31 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_v16i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_ashr_i32 s0, s0, s16 -; GFX10-NEXT: s_ashr_i32 s1, s1, s17 -; GFX10-NEXT: s_ashr_i32 s2, s2, s18 -; GFX10-NEXT: s_ashr_i32 s3, s3, s19 -; GFX10-NEXT: s_ashr_i32 s4, s4, s20 -; GFX10-NEXT: s_ashr_i32 s5, s5, s21 -; GFX10-NEXT: s_ashr_i32 s6, s6, s22 -; GFX10-NEXT: s_ashr_i32 s7, s7, s23 -; GFX10-NEXT: s_ashr_i32 s8, s8, s24 -; GFX10-NEXT: s_ashr_i32 s9, s9, s25 -; GFX10-NEXT: s_ashr_i32 s10, s10, s26 -; GFX10-NEXT: s_ashr_i32 s11, s11, s27 -; GFX10-NEXT: s_ashr_i32 s12, s12, s28 -; GFX10-NEXT: s_ashr_i32 s13, s13, s29 -; GFX10-NEXT: s_ashr_i32 s14, s14, s30 -; GFX10-NEXT: s_ashr_i32 s15, s15, s31 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_v16i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s16 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s1, s17 +; GFX10PLUS-NEXT: s_ashr_i32 s2, s2, s18 +; GFX10PLUS-NEXT: s_ashr_i32 s3, s3, s19 +; GFX10PLUS-NEXT: s_ashr_i32 s4, s4, s20 +; GFX10PLUS-NEXT: s_ashr_i32 s5, s5, s21 +; GFX10PLUS-NEXT: s_ashr_i32 s6, s6, s22 +; GFX10PLUS-NEXT: s_ashr_i32 s7, s7, s23 +; GFX10PLUS-NEXT: s_ashr_i32 s8, s8, s24 +; GFX10PLUS-NEXT: s_ashr_i32 s9, s9, s25 +; GFX10PLUS-NEXT: s_ashr_i32 s10, s10, s26 +; GFX10PLUS-NEXT: s_ashr_i32 s11, s11, s27 +; GFX10PLUS-NEXT: s_ashr_i32 s12, s12, s28 +; GFX10PLUS-NEXT: s_ashr_i32 s13, s13, s29 +; GFX10PLUS-NEXT: s_ashr_i32 s14, s14, s30 +; GFX10PLUS-NEXT: s_ashr_i32 s15, s15, s31 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr <16 x i32> 
%value, %amount ret <16 x i32> %result } @@ -592,12 +617,12 @@ ; GFX9-NEXT: v_ashrrev_i16_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i16 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i16 v0, v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i16 %value, %amount ret i16 %result } @@ -608,11 +633,11 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_i16_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_i16_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i16 %value, 31 ret i16 %result } @@ -638,12 +663,12 @@ ; GFX9-NEXT: s_ashr_i32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-NEXT: s_ashr_i32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 +; GFX10PLUS-NEXT: s_sext_i32_i16 s1, s1 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i16 %value, %amount ret i16 %result } @@ -655,11 +680,11 @@ ; GCN-NEXT: s_ashr_i32 s0, s0, 15 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_i16_15: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: s_ashr_i32 
s0, s0, 15 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_i16_15: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 15 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i16 %value, 15 ret i16 %result } @@ -682,10 +707,10 @@ ; GFX9-NEXT: v_ashrrev_i16_e64 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: ashr_i16_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_ashrrev_i16 v0, v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: ashr_i16_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_ashrrev_i16 v0, v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i16 %value, %amount %cast = bitcast i16 %result to half ret half %cast @@ -709,10 +734,10 @@ ; GFX9-NEXT: v_ashrrev_i16_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: ashr_i16_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_ashrrev_i16 v0, s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: ashr_i16_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_ashrrev_i16 v0, s0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i16 %value, %amount %cast = bitcast i16 %result to half ret half %cast @@ -744,12 +769,12 @@ ; GFX9-NEXT: v_pk_ashrrev_i16 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_v2i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_ashrrev_i16 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_v2i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_ashrrev_i16 v0, v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i16> %value, %amount ret <2 x i16> %result } @@ -779,12 +804,12 @@ ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 
s[30:31] ; -; GFX10-LABEL: v_ashr_v2i16_15: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_v2i16_15: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i16> %value, ret <2 x i16> %result } @@ -826,16 +851,16 @@ ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_v2i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s2, s0 -; GFX10-NEXT: s_ashr_i32 s0, s0, 16 -; GFX10-NEXT: s_sext_i32_i16 s3, s1 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_ashr_i32 s2, s2, s3 -; GFX10-NEXT: s_ashr_i32 s0, s0, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s2, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_v2i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_sext_i32_i16 s2, s0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 16 +; GFX10PLUS-NEXT: s_sext_i32_i16 s3, s1 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s1, 16 +; GFX10PLUS-NEXT: s_ashr_i32 s2, s2, s3 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s1 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to i32 ret i32 %cast @@ -870,10 +895,10 @@ ; GFX9-NEXT: v_pk_ashrrev_i16 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: ashr_v2i16_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_pk_ashrrev_i16 v0, v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: ashr_v2i16_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_pk_ashrrev_i16 v0, v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr <2 x i16> %value, %amount %cast = 
bitcast <2 x i16> %result to float ret float %cast @@ -908,10 +933,10 @@ ; GFX9-NEXT: v_pk_ashrrev_i16 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: ashr_v2i16_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_pk_ashrrev_i16 v0, s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: ashr_v2i16_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_pk_ashrrev_i16 v0, s0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to float ret float %cast @@ -972,13 +997,13 @@ ; GFX9-NEXT: v_pk_ashrrev_i16 v1, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_v4i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_ashrrev_i16 v0, v2, v0 -; GFX10-NEXT: v_pk_ashrrev_i16 v1, v3, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_v4i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_ashrrev_i16 v0, v2, v0 +; GFX10PLUS-NEXT: v_pk_ashrrev_i16 v1, v3, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x float> ret <2 x float> %cast @@ -1045,23 +1070,23 @@ ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_v4i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_sext_i32_i16 s4, s0 -; GFX10-NEXT: s_ashr_i32 s0, s0, 16 -; GFX10-NEXT: s_sext_i32_i16 s5, s2 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_ashr_i32 s4, s4, s5 -; GFX10-NEXT: s_ashr_i32 s0, s0, s2 -; GFX10-NEXT: s_sext_i32_i16 s2, s1 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_sext_i32_i16 s5, s3 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_ashr_i32 s2, s2, s5 -; GFX10-NEXT: s_ashr_i32 s1, s1, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s4, s0 -; GFX10-NEXT: 
s_pack_ll_b32_b16 s1, s2, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_v4i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_sext_i32_i16 s4, s0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 16 +; GFX10PLUS-NEXT: s_sext_i32_i16 s5, s2 +; GFX10PLUS-NEXT: s_ashr_i32 s2, s2, 16 +; GFX10PLUS-NEXT: s_ashr_i32 s4, s4, s5 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s2 +; GFX10PLUS-NEXT: s_sext_i32_i16 s2, s1 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s1, 16 +; GFX10PLUS-NEXT: s_sext_i32_i16 s5, s3 +; GFX10PLUS-NEXT: s_ashr_i32 s3, s3, 16 +; GFX10PLUS-NEXT: s_ashr_i32 s2, s2, s5 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s1, s3 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s4, s0 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s2, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x i32> ret <2 x i32> %cast @@ -1162,15 +1187,15 @@ ; GFX9-NEXT: v_pk_ashrrev_i16 v3, v7, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_v8i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_ashrrev_i16 v0, v4, v0 -; GFX10-NEXT: v_pk_ashrrev_i16 v1, v5, v1 -; GFX10-NEXT: v_pk_ashrrev_i16 v2, v6, v2 -; GFX10-NEXT: v_pk_ashrrev_i16 v3, v7, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_v8i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_ashrrev_i16 v0, v4, v0 +; GFX10PLUS-NEXT: v_pk_ashrrev_i16 v1, v5, v1 +; GFX10PLUS-NEXT: v_pk_ashrrev_i16 v2, v6, v2 +; GFX10PLUS-NEXT: v_pk_ashrrev_i16 v3, v7, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x float> ret <4 x float> %cast @@ -1285,37 +1310,37 @@ ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_v8i16: -; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_sext_i32_i16 s8, s0 -; GFX10-NEXT: s_ashr_i32 s0, s0, 16 -; GFX10-NEXT: s_sext_i32_i16 s9, s4 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_ashr_i32 s8, s8, s9 -; GFX10-NEXT: s_ashr_i32 s0, s0, s4 -; GFX10-NEXT: s_sext_i32_i16 s4, s1 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_sext_i32_i16 s9, s5 -; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_ashr_i32 s4, s4, s9 -; GFX10-NEXT: s_ashr_i32 s1, s1, s5 -; GFX10-NEXT: s_sext_i32_i16 s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s4, s1 -; GFX10-NEXT: s_sext_i32_i16 s4, s2 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_ashr_i32 s4, s4, s5 -; GFX10-NEXT: s_ashr_i32 s2, s2, s6 -; GFX10-NEXT: s_sext_i32_i16 s5, s3 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_sext_i32_i16 s6, s7 -; GFX10-NEXT: s_ashr_i32 s7, s7, 16 -; GFX10-NEXT: s_ashr_i32 s5, s5, s6 -; GFX10-NEXT: s_ashr_i32 s3, s3, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s8, s0 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s5, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_v8i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_sext_i32_i16 s8, s0 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, 16 +; GFX10PLUS-NEXT: s_sext_i32_i16 s9, s4 +; GFX10PLUS-NEXT: s_ashr_i32 s4, s4, 16 +; GFX10PLUS-NEXT: s_ashr_i32 s8, s8, s9 +; GFX10PLUS-NEXT: s_ashr_i32 s0, s0, s4 +; GFX10PLUS-NEXT: s_sext_i32_i16 s4, s1 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s1, 16 +; GFX10PLUS-NEXT: s_sext_i32_i16 s9, s5 +; GFX10PLUS-NEXT: s_ashr_i32 s5, s5, 16 +; GFX10PLUS-NEXT: s_ashr_i32 s4, s4, s9 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s1, s5 +; GFX10PLUS-NEXT: s_sext_i32_i16 s5, s6 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s4, s1 +; GFX10PLUS-NEXT: s_sext_i32_i16 s4, s2 +; GFX10PLUS-NEXT: s_ashr_i32 s2, s2, 16 +; GFX10PLUS-NEXT: s_ashr_i32 s6, s6, 16 +; GFX10PLUS-NEXT: s_ashr_i32 s4, s4, s5 +; GFX10PLUS-NEXT: s_ashr_i32 s2, s2, s6 +; GFX10PLUS-NEXT: s_sext_i32_i16 s5, s3 +; 
GFX10PLUS-NEXT: s_ashr_i32 s3, s3, 16 +; GFX10PLUS-NEXT: s_sext_i32_i16 s6, s7 +; GFX10PLUS-NEXT: s_ashr_i32 s7, s7, 16 +; GFX10PLUS-NEXT: s_ashr_i32 s5, s5, s6 +; GFX10PLUS-NEXT: s_ashr_i32 s3, s3, s7 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s8, s0 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s3, s5, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x i32> ret <4 x i32> %cast @@ -1340,12 +1365,12 @@ ; GFX9-NEXT: v_ashrrev_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], v2, v[0:1] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i64 v[0:1], v2, v[0:1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i64 %value, %amount ret i64 %result } @@ -1358,13 +1383,13 @@ ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_i64_63: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_i64_63: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 31, v1 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i64 %value, 63 ret i64 %result } @@ -1378,13 +1403,13 @@ ; GCN-NEXT: v_mov_b32_e32 v1, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_i64_33: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 1, v1 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_i64_33: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v0, 1, v1 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i64 %value, 33 ret i64 %result } @@ -1397,13 +1422,13 @@ ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_i64_32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_i64_32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v1 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i64 %value, 32 ret i64 %result } @@ -1427,12 +1452,12 @@ ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_i64_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_i64_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i64 %value, 31 ret i64 %result } @@ -1443,10 +1468,10 @@ ; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 ; GCN-NEXT: ; return to shader part epilog ; -; 
GFX10-LABEL: s_ashr_i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i64 %value, %amount ret i64 %result } @@ -1458,11 +1483,11 @@ ; GCN-NEXT: s_mov_b32 s1, s0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_i64_63: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_ashr_i32 s0, s1, 31 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_i64_63: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_ashr_i32 s0, s1, 31 +; GFX10PLUS-NEXT: s_mov_b32 s1, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i64 %value, 63 ret i64 %result } @@ -1475,11 +1500,11 @@ ; GCN-NEXT: s_mov_b32 s1, s2 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_i64_33: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_ashr_i32 s0, s1, 1 -; GFX10-NEXT: s_ashr_i32 s1, s1, 31 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_i64_33: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_ashr_i32 s0, s1, 1 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s1, 31 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i64 %value, 33 ret i64 %result } @@ -1491,11 +1516,11 @@ ; GCN-NEXT: s_ashr_i32 s1, s1, 31 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_i64_32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s1 -; GFX10-NEXT: s_ashr_i32 s1, s1, 31 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_i64_32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s1 +; GFX10PLUS-NEXT: s_ashr_i32 s1, s1, 31 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i64 %value, 32 ret i64 %result } @@ -1506,10 +1531,10 @@ ; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 31 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_i64_31: -; GFX10: ; 
%bb.0: -; GFX10-NEXT: s_ashr_i64 s[0:1], s[0:1], 31 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_i64_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_ashr_i64 s[0:1], s[0:1], 31 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i64 %value, 31 ret i64 %result } @@ -1530,10 +1555,10 @@ ; GFX9-NEXT: v_ashrrev_i64 v[0:1], v0, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: ashr_i64_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_ashrrev_i64 v[0:1], v0, s[0:1] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: ashr_i64_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_ashrrev_i64 v[0:1], v0, s[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i64 %value, %amount %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -1555,10 +1580,10 @@ ; GFX9-NEXT: v_ashrrev_i64 v[0:1], s0, v[0:1] ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: ashr_i64_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_ashrrev_i64 v[0:1], s0, v[0:1] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: ashr_i64_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_ashrrev_i64 v[0:1], s0, v[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i64 %value, %amount %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -1586,13 +1611,13 @@ ; GFX9-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_v2i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1] -; GFX10-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_v2i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1] +; GFX10PLUS-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3] +; 
GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i64> %value, %amount ret <2 x i64> %result } @@ -1619,13 +1644,13 @@ ; GFX9-NEXT: v_ashrrev_i64 v[2:3], 31, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_v2i64_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] -; GFX10-NEXT: v_ashrrev_i64 v[2:3], 31, v[2:3] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_v2i64_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1] +; GFX10PLUS-NEXT: v_ashrrev_i64 v[2:3], 31, v[2:3] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i64> %value, ret <2 x i64> %result } @@ -1637,11 +1662,11 @@ ; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], s6 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_v2i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_ashr_i64 s[0:1], s[0:1], s4 -; GFX10-NEXT: s_ashr_i64 s[2:3], s[2:3], s6 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_v2i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_ashr_i64 s[0:1], s[0:1], s4 +; GFX10PLUS-NEXT: s_ashr_i64 s[2:3], s[2:3], s6 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr <2 x i64> %value, %amount ret <2 x i64> %result } @@ -1739,6 +1764,30 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v1, v4, v1, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_ashr_i65: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_bfe_i32 v4, v2, 0, 1 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v3 +; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v3 +; GFX11-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 +; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; 
GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v2, v[4:5] +; GFX11-NEXT: v_ashrrev_i64 v[10:11], v10, v[4:5] +; GFX11-NEXT: v_or_b32_e32 v2, v6, v8 +; GFX11-NEXT: v_or_b32_e32 v8, v7, v9 +; GFX11-NEXT: v_ashrrev_i64 v[6:7], v3, v[4:5] +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v4, v11, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v4, v1, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = ashr i65 %value, %amount ret i65 %result } @@ -1780,18 +1829,18 @@ ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ashr_i65_33: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_bfe_i32 v1, v2, 0, 1 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] -; GFX10-NEXT: v_ashrrev_i32_e32 v2, 1, v2 -; GFX10-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ashr_i65_33: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v1 +; GFX10PLUS-NEXT: v_bfe_i32 v1, v2, 0, 1 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v2, 1, v2 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = ashr i65 %value, 33 ret i65 %result } @@ -1821,29 +1870,29 @@ ; GCN-NEXT: s_cselect_b64 s[2:3], s[6:7], s[8:9] ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ashr_i65: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_i64 s[4:5], s[2:3], 
0x10000 -; GFX10-NEXT: s_sub_i32 s12, s3, 64 -; GFX10-NEXT: s_sub_i32 s8, 64, s3 -; GFX10-NEXT: s_cmp_lt_u32 s3, 64 -; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s3, 0 -; GFX10-NEXT: s_cselect_b32 s14, 1, 0 -; GFX10-NEXT: s_ashr_i64 s[6:7], s[4:5], s3 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[0:1], s3 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GFX10-NEXT: s_ashr_i32 s10, s5, 31 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_ashr_i64 s[4:5], s[4:5], s12 -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_mov_b32 s11, s10 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s14, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[10:11] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_i65: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 +; GFX10PLUS-NEXT: s_sub_i32 s12, s3, 64 +; GFX10PLUS-NEXT: s_sub_i32 s8, 64, s3 +; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 +; GFX10PLUS-NEXT: s_cselect_b32 s13, 1, 0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10PLUS-NEXT: s_cselect_b32 s14, 1, 0 +; GFX10PLUS-NEXT: s_ashr_i64 s[6:7], s[4:5], s3 +; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[0:1], s3 +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; GFX10PLUS-NEXT: s_ashr_i32 s10, s5, 31 +; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX10PLUS-NEXT: s_ashr_i64 s[4:5], s[4:5], s12 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10PLUS-NEXT: s_mov_b32 s11, s10 +; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX10PLUS-NEXT: s_cmp_lg_u32 s14, 0 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GFX10PLUS-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[6:7], s[10:11] +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i65 %value, %amount ret i65 %result } @@ -1859,15 +1908,15 @@ ; GCN-NEXT: s_ashr_i32 s2, s3, 1 ; GCN-NEXT: ; return to shader part 
epilog ; -; GFX10-LABEL: s_ashr_i65_33: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 -; GFX10-NEXT: s_lshr_b32 s0, s1, 1 -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 -; GFX10-NEXT: s_ashr_i32 s2, s3, 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ashr_i65_33: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 +; GFX10PLUS-NEXT: s_ashr_i32 s2, s3, 1 +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX10PLUS-NEXT: ; return to shader part epilog %result = ashr i65 %value, 33 ret i65 %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps i32 @s_bswap_i32(i32 inreg %src) { ; GFX7-LABEL: s_bswap_i32: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | 
FileCheck -enable-var-scope -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s ; Make sure we don't violate the constant bus restriction @@ -11,10 +12,10 @@ ; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: fmul_s_s: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mul_f32_e64 v0, s2, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: fmul_s_s: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_mul_f32_e64 v0, s2, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = fmul float %src0, %src1 ret float %result } @@ -25,10 +26,10 @@ ; GFX9-NEXT: v_mul_f32_e64 v0, s2, s2 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: fmul_ss: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mul_f32_e64 v0, s2, s2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: fmul_ss: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_mul_f32_e64 v0, s2, s2 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = fmul float %src, %src ret float %result } @@ -42,11 +43,11 @@ ; GFX9-NEXT: v_fma_f32 v0, s2, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: fma_s_s_s: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_fma_f32 v0, s3, s2, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: fma_s_s_s: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s4 +; GFX10PLUS-NEXT: v_fma_f32 v0, s3, s2, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = call float @llvm.fma.f32(float %src0, float %src1, float 
%src2) ret float %result } @@ -58,10 +59,10 @@ ; GFX9-NEXT: v_fma_f32 v0, s2, s2, s2 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: fma_sss: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_fma_f32 v0, s2, s2, s2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: fma_sss: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_fma_f32 v0, s2, s2, s2 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = call float @llvm.fma.f32(float %src, float %src, float %src) ret float %result } @@ -74,10 +75,10 @@ ; GFX9-NEXT: v_fma_f32 v0, s2, s2, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: fma_ss_s: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_fma_f32 v0, s2, s2, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: fma_ss_s: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_fma_f32 v0, s2, s2, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = call float @llvm.fma.f32(float %src01, float %src01, float %src2) ret float %result } @@ -90,10 +91,10 @@ ; GFX9-NEXT: v_fma_f32 v0, s2, v0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: fma_s_ss: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_fma_f32 v0, s2, s3, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: fma_s_ss: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_fma_f32 v0, s2, s3, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = call float @llvm.fma.f32(float %src0, float %src12, float %src12) ret float %result } @@ -106,10 +107,10 @@ ; GFX9-NEXT: v_fma_f32 v0, s2, v0, s2 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: fma_ss_s_same_outer: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_fma_f32 v0, s2, s3, s2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: fma_ss_s_same_outer: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_fma_f32 v0, s2, s3, s2 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = call float @llvm.fma.f32(float %src02, float %src1, float %src02) ret float %result } @@ -122,11 +123,11 @@ ; GFX9-NEXT: 
v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: fcmp_s_s: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_cmp_eq_f32_e64 s0, s2, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: fcmp_s_s: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_cmp_eq_f32_e64 s0, s2, s3 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %cmp = fcmp oeq float %src0, %src1 %result = select i1 %cmp, float 1.0, float 0.0 ret float %result @@ -141,12 +142,12 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: select_vcc_s_s: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, s2, vcc_lo -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: select_vcc_s_s: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, s3 +; GFX10PLUS-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, s2, vcc_lo +; GFX10PLUS-NEXT: ; return to shader part epilog %cmp = fcmp oeq float %cmp0, %cmp1 %result = select i1 %cmp, float %src0, float %src1 ret float %result @@ -161,12 +162,12 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, -v3, vcc ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: select_vcc_fneg_s_s: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 -; GFX10-NEXT: v_cndmask_b32_e64 v0, s3, -v2, vcc_lo -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: select_vcc_fneg_s_s: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_mov_b32_e32 v2, s2 +; GFX10PLUS-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s3, -v2, vcc_lo +; GFX10PLUS-NEXT: ; return to shader part epilog %cmp = fcmp oeq float %cmp0, %cmp1 %neg.src0 = fneg float %src0 %result = select i1 %cmp, float %neg.src0, float 
%src1 @@ -183,11 +184,11 @@ ; GFX9-NEXT: v_div_fmas_f32 v0, v0, v0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: amdgcn_div_fmas_sss: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_div_fmas_f32 v0, s2, s2, s2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: amdgcn_div_fmas_sss: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 +; GFX10PLUS-NEXT: v_div_fmas_f32 v0, s2, s2, s2 +; GFX10PLUS-NEXT: ; return to shader part epilog %vcc = fcmp oeq float %cmp.src, 0.0 %result = call float @llvm.amdgcn.div.fmas.f32(float %src, float %src, float %src, i1 %vcc) ret float %result @@ -201,11 +202,11 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: class_s_s: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_cmp_class_f32_e64 s0, s2, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: class_s_s: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_cmp_class_f32_e64 s0, s2, s3 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %class = call i1 @llvm.amdgcn.class.f32(float %src0, i32 %src1) %result = select i1 %class, float 1.0, float 0.0 ret float %result @@ -222,6 +223,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_div_scale_f32 v0, s0, s2, s3, s2 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: div_scale_s_s_true: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2 +; GFX11-NEXT: ; return to shader part epilog %div.scale = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %src0, float %src1, i1 true) %result = extractvalue { float, i1 } %div.scale, 0 ret float %result @@ -238,6 +244,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_div_scale_f32 v0, s0, s3, s3, s2 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: div_scale_s_s_false: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, 
s2 +; GFX11-NEXT: ; return to shader part epilog %div.scale = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %src0, float %src1, i1 false) %result = extractvalue { float, i1 } %div.scale, 0 ret float %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s @gv = external addrspace(4) constant i32 @@ -38,6 +39,23 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mov_b32 s32, 16 +; GFX11-NEXT: s_mov_b32 s33, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s0, -16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s32, s0 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %alloca = alloca i32, i32 %n, align 4, addrspace(5) store i32 0, i32 addrspace(5)* %alloca ret void @@ -94,6 +112,33 @@ ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s0, -16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s32, s0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, i32 addrspace(4)* @gv, align 4 %alloca = alloca i32, i32 %n, addrspace(5) store i32 0, i32 addrspace(5)* %alloca @@ -134,6 +179,23 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mov_b32 s32, 16 +; GFX11-NEXT: s_mov_b32 s33, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s0, -16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s32, s0 +; GFX11-NEXT: 
scratch_store_b32 off, v0, s0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %alloca = alloca i32, i32 %n, align 16, addrspace(5) store i32 0, i32 addrspace(5)* %alloca ret void @@ -190,6 +252,33 @@ ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s0, -16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s32, s0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, i32 addrspace(4)* @gv, align 16 %alloca = alloca i32, i32 %n, addrspace(5) store i32 0, i32 addrspace(5)* %alloca @@ -232,6 +321,24 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: kernel_dynamic_stackalloc_sgpr_align32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s32, 32 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mov_b32 s33, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: 
s_lshl2_add_u32 s0, s0, 15 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s0, -16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s32, s0 +; GFX11-NEXT: s_and_b32 s0, s0, 0xfffffc00 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %alloca = alloca i32, i32 %n, align 32, addrspace(5) store i32 0, i32 addrspace(5)* %alloca ret void @@ -292,6 +399,35 @@ ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_add_i32 s33, s32, 31 +; GFX11-NEXT: s_add_i32 s32, s32, 64 +; GFX11-NEXT: s_and_not1_b32 s33, s33, 31 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl2_add_u32 s0, s0, 15 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, s0, -16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s32, s0 +; GFX11-NEXT: s_addk_i32 s32, 0xffc0 +; GFX11-NEXT: s_and_b32 s0, s0, 0xfffffc00 +; GFX11-NEXT: scratch_store_b32 off, v0, s0 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, i32 addrspace(4)* @gv %alloca = alloca i32, i32 %n, align 32, 
addrspace(5) store i32 0, i32 addrspace(5)* %alloca diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(<4 x i128> addrspace(4)* inreg %ptr, i32 inreg %idx) { ; GCN-LABEL: extractelement_sgpr_v4i128_sgpr_idx: @@ -22,6 +23,15 @@ ; GFX10-NEXT: s_movrels_b64 s[0:1], s[8:9] ; GFX10-NEXT: s_movrels_b64 s[2:3], s[10:11] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i128_sgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b512 s[8:23], s[2:3], 0x0 +; GFX11-NEXT: s_lshl_b32 m0, s4, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_movrels_b64 s[0:1], s[8:9] +; GFX11-NEXT: s_movrels_b64 s[2:3], s[10:11] +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 %idx ret i128 %element @@ -117,6 +127,28 @@ ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_vgpr_v4i128_sgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off +; GFX11-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16 +; 
GFX11-NEXT: global_load_b128 v[10:13], v[0:1], off offset:32 +; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off offset:48 +; GFX11-NEXT: s_lshl_b32 s0, s2, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_lshl_b32 m0, s0, 1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_movrels_b32_e32 v0, v2 +; GFX11-NEXT: v_movrels_b32_e32 v1, v3 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 %idx ret i128 %element @@ -360,6 +392,66 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v17, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, v18, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i128_vgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off +; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off offset:16 +; GFX11-NEXT: global_load_b128 v[11:14], v[0:1], off offset:32 +; GFX11-NEXT: global_load_b128 v[15:18], v[0:1], off offset:48 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_dual_cndmask_b32 v20, v3, v5 :: v_dual_cndmask_b32 v21, v4, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v21, v8, vcc_lo +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v20, v7 :: v_dual_add_nc_u32 v19, 1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v9 :: v_dual_cndmask_b32 v1, v1, v10 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v5, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v6, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v19 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v11 :: v_dual_cndmask_b32 v1, v1, v12 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v19 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v13 :: v_dual_cndmask_b32 v1, v1, v14 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v19 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v15 :: v_dual_cndmask_b32 v1, v1, v16 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v11, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v19 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v17 :: v_dual_cndmask_b32 v1, v1, v18 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v14, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v19 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v16, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v19 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, v17, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v4, v18, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 %idx ret i128 %element @@ -631,6 +723,69 @@ ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i128_vgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_add_nc_u32 v1, 1, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, s4, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v5, s5, v3, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, s4, v2, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, s5, v3, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s8, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s9, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s11, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s12, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s13, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s12, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s13, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s14, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s15, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s14, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s15, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s16, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s17, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s16, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s17, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, s18, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 %idx ret i128 %element @@ -648,6 +803,12 @@ ; GFX10-NEXT: s_load_dwordx16 s[0:15], 
s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i128_idx0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 0 ret i128 %element @@ -673,6 +834,16 @@ ; GFX10-NEXT: s_mov_b32 s2, s6 ; GFX10-NEXT: s_mov_b32 s3, s7 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i128_idx1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: s_mov_b32 s2, s6 +; GFX11-NEXT: s_mov_b32 s3, s7 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 1 ret i128 %element @@ -698,6 +869,16 @@ ; GFX10-NEXT: s_mov_b32 s2, s10 ; GFX10-NEXT: s_mov_b32 s3, s11 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i128_idx2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, s8 +; GFX11-NEXT: s_mov_b32 s1, s9 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 2 ret i128 %element @@ -723,6 +904,16 @@ ; GFX10-NEXT: s_mov_b32 s2, s14 ; GFX10-NEXT: s_mov_b32 s3, s15 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i128_idx3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, s12 +; GFX11-NEXT: s_mov_b32 s1, s13 +; GFX11-NEXT: s_mov_b32 s2, s14 +; GFX11-NEXT: 
s_mov_b32 s3, s15 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 3 ret i128 %element @@ -760,6 +951,14 @@ ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i128_idx0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 0 ret i128 %element @@ -815,6 +1014,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v6 ; GFX10-NEXT: v_mov_b32_e32 v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i128_idx1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v3, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 1 ret i128 %element @@ -870,6 +1079,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v10 ; GFX10-NEXT: v_mov_b32_e32 v3, v11 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i128_idx2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 
x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 2 ret i128 %element @@ -925,6 +1144,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v14 ; GFX10-NEXT: v_mov_b32_e32 v3, v15 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i128_idx3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:48 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, v12 :: v_dual_mov_b32 v1, v13 +; GFX11-NEXT: v_dual_mov_b32 v2, v14 :: v_dual_mov_b32 v3, v15 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 3 ret i128 %element diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps i16 @extractelement_sgpr_v4i16_sgpr_idx(<4 x i16> addrspace(4)* inreg %ptr, i32 inreg %idx) { ; GCN-LABEL: extractelement_sgpr_v4i16_sgpr_idx: @@ -28,6 +29,20 @@ ; GFX10-NEXT: s_lshl_b32 s1, s1, 4 ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i16_sgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], 
s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s2, s4, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_eq_u32 s2, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cselect_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s4, 1 +; GFX11-NEXT: s_lshl_b32 s1, s1, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 %idx ret i16 %element @@ -85,6 +100,21 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_vgpr_v4i16_sgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_lshr_b32 s0, s2, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX11-NEXT: s_and_b32 s0, s2, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 %idx ret i16 %element @@ -143,6 +173,21 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i16_vgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_lshlrev_b32 v1, 4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 %idx ret i16 %element @@ -177,6 +222,22 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i16_vgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: v_dual_cndmask_b32 v1, s0, v2 :: v_dual_and_b32 v0, 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 %idx ret i16 %element @@ -194,6 +255,12 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i16_idx0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 0 ret i16 %element @@ -213,6 +280,13 @@ ; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i16_idx1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 1 ret i16 %element @@ -232,6 +306,13 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i16_idx2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, s1 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 2 ret i16 %element @@ -251,6 +332,13 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s0, s1, 16 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i16_idx3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s0, s1, 16 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 3 ret i16 %element @@ -285,6 +373,14 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i16_idx0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 0 ret i16 
%element @@ -323,6 +419,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i16_idx1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 1 ret i16 %element @@ -361,6 +466,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i16_idx2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 2 ret i16 %element @@ -399,6 +513,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i16_idx3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 3 ret i16 %element @@ -436,6 +559,24 @@ ; GFX10-NEXT: s_lshl_b32 s1, s1, 4 ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i16_sgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], 
s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s5, s4, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_eq_u32 s5, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cselect_b32 s0, s1, s0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 2 +; GFX11-NEXT: s_cselect_b32 s0, s2, s0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 3 +; GFX11-NEXT: s_cselect_b32 s0, s3, s0 +; GFX11-NEXT: s_and_b32 s1, s4, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s1, s1, 4 +; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 %idx ret i16 %element @@ -512,6 +653,26 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_vgpr_v8i16_sgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_lshr_b32 s0, s2, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3 +; GFX11-NEXT: s_and_b32 s0, s2, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 %idx ret i16 %element @@ -589,6 +750,26 @@ ; GFX10-NEXT: 
v_lshlrev_b32_e32 v1, 4, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i16_vgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 %idx ret i16 %element @@ -633,6 +814,28 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i16_vgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s2, 
vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 %idx ret i16 %element @@ -650,6 +853,12 @@ ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i16_idx0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 0 ret i16 %element @@ -669,6 +878,13 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i16_idx1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 1 ret i16 %element @@ -688,6 +904,13 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i16_idx2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, s1 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 2 ret i16 %element @@ -707,6 +930,13 @@ ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s0, s1, 16 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i16_idx3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s0, s1, 16 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 3 ret i16 %element @@ -726,6 +956,13 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i16_idx4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 4 ret i16 %element @@ -745,6 +982,13 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s0, s2, 16 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i16_idx5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s0, s2, 16 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 5 ret i16 %element @@ -764,6 +1008,13 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s0, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i16_idx6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, s3 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 6 ret i16 %element @@ -783,6 +1034,13 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
s_lshr_b32 s0, s3, 16 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i16_idx7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s0, s3, 16 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 7 ret i16 %element @@ -820,6 +1078,14 @@ ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i16_idx0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 0 ret i16 %element @@ -861,6 +1127,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i16_idx1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 1 ret i16 %element @@ -902,6 +1177,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i16_idx2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: 
s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 2 ret i16 %element @@ -943,6 +1227,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i16_idx3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 3 ret i16 %element @@ -984,6 +1277,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i16_idx4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 4 ret i16 %element @@ -1025,6 +1327,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i16_idx5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 5 ret i16 %element @@ -1066,6 +1377,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i16_idx6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 6 ret i16 %element @@ -1107,6 +1427,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i16_idx7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 7 ret i16 %element diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps i8 @extractelement_sgpr_v4i8_sgpr_idx(<4 x i8> addrspace(4)* inreg %ptr, i32 inreg %idx) { ; GCN-LABEL: 
extractelement_sgpr_v4i8_sgpr_idx: @@ -42,6 +43,26 @@ ; GFX10-NEXT: s_lshl_b32 s1, s2, 3 ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i8_sgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX11-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-NEXT: s_and_b32 s2, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 24 +; GFX11-NEXT: s_or_b32 s0, s2, s0 +; GFX11-NEXT: s_and_b32 s2, s4, 3 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s1, s2, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %element = extractelement <4 x i8> %vector, i32 %idx ret i8 %element @@ -126,6 +147,28 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_vgpr_v4i8_sgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_and_b32 s0, s2, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_lshl_b32 s0, s0, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %element = extractelement <4 x i8> %vector, i32 %idx ret i8 %element @@ -211,6 +254,29 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i8_vgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or3_b32 v0, v0, v3, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %element = extractelement <4 x i8> %vector, i32 %idx ret i8 %element @@ -296,6 +362,29 @@ ; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i8_vgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX11-NEXT: s_and_b32 s1, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_lshl_b32 s0, s0, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %element = extractelement <4 x i8> %vector, i32 %idx ret i8 %element @@ -333,6 +422,23 @@ ; GFX10-NEXT: s_lshl_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i8_idx0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX11-NEXT: s_and_b32 s1, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_lshl_b32 s0, s0, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %element = extractelement <4 x i8> %vector, i32 0 ret i8 %element @@ -372,6 +478,24 @@ ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i8_idx1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX11-NEXT: s_and_b32 s1, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_lshl_b32 s0, s0, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_lshr_b32 s0, s0, 8 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %element = extractelement <4 x i8> %vector, i32 1 ret i8 %element @@ -411,6 +535,24 @@ ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i8_idx2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX11-NEXT: s_and_b32 s1, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_lshl_b32 s0, s0, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %element = extractelement <4 x i8> %vector, i32 2 ret i8 %element @@ -450,6 +592,24 @@ ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v4i8_idx3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX11-NEXT: s_and_b32 s1, s0, 0xff +; 
GFX11-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_lshl_b32 s0, s0, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: ; return to shader part epilog %vector = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %element = extractelement <4 x i8> %vector, i32 3 ret i8 %element @@ -523,6 +683,25 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i8_idx0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %element = extractelement <4 x i8> %vector, i32 0 ret i8 %element @@ -600,6 +779,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i8_idx1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: 
global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %element = extractelement <4 x i8> %vector, i32 1 ret i8 %element @@ -677,6 +876,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i8_idx2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %element = extractelement <4 
x i8> %vector, i32 2 ret i8 %element @@ -754,6 +973,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v4i8_idx3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %element = extractelement <4 x i8> %vector, i32 3 ret i8 %element @@ -823,6 +1062,39 @@ ; GFX10-NEXT: s_lshl_b32 s1, s1, 3 ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i8_sgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s2, s4, 2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s7, s0, 0x80008 +; GFX11-NEXT: s_bfe_u32 s9, s1, 0x80008 +; GFX11-NEXT: s_lshr_b32 s3, s0, 24 +; GFX11-NEXT: s_lshr_b32 s5, s1, 24 +; GFX11-NEXT: s_and_b32 s6, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_and_b32 s8, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_lshl_b32 s1, 
s1, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s8, s9 +; GFX11-NEXT: s_lshl_b32 s3, s3, 24 +; GFX11-NEXT: s_lshl_b32 s5, s5, 24 +; GFX11-NEXT: s_or_b32 s0, s6, s0 +; GFX11-NEXT: s_or_b32 s1, s7, s1 +; GFX11-NEXT: s_or_b32 s0, s0, s3 +; GFX11-NEXT: s_or_b32 s1, s1, s5 +; GFX11-NEXT: s_cmp_eq_u32 s2, 1 +; GFX11-NEXT: s_cselect_b32 s0, s1, s0 +; GFX11-NEXT: s_and_b32 s1, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s1, s1, 3 +; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 %idx ret i8 %element @@ -948,6 +1220,40 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_vgpr_v8i8_sgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_lshr_b32 s0, s2, 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX11-NEXT: s_and_b32 s0, s2, 3 +; GFX11-NEXT: s_lshl_b32 s0, s0, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v5, v1, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 8 +; GFX11-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_or3_b32 v1, v1, v7, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v3 +; GFX11-NEXT: v_or3_b32 v0, v0, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 %idx ret i8 %element @@ -1074,6 +1380,38 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i8_vgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v4, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v6, v1, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v4 +; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 2, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or3_b32 v1, v1, v8, v7 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v5, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v1 
:: v_dual_lshlrev_b32 v1, 3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 %idx ret i8 %element @@ -1148,6 +1486,43 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i8_vgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX11-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-NEXT: s_and_b32 s6, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 24 +; GFX11-NEXT: s_or_b32 s1, s6, s1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: s_lshl_b32 s2, s2, 24 +; GFX11-NEXT: s_or_b32 s0, s3, s0 +; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_cndmask_b32 v1, s0, v2 :: v_dual_and_b32 v0, 3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> 
addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 %idx ret i8 %element @@ -1185,6 +1560,23 @@ ; GFX10-NEXT: s_lshl_b32 s0, s0, 24 ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i8_idx0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX11-NEXT: s_and_b32 s1, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_lshl_b32 s0, s0, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 0 ret i8 %element @@ -1224,6 +1616,24 @@ ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i8_idx1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX11-NEXT: s_and_b32 s1, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_lshl_b32 s0, s0, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_lshr_b32 s0, s0, 8 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 1 ret i8 %element @@ -1263,6 +1673,24 @@ ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: s_lshr_b32 s0, 
s0, 16 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i8_idx2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX11-NEXT: s_and_b32 s1, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_lshl_b32 s0, s0, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 2 ret i8 %element @@ -1302,6 +1730,24 @@ ; GFX10-NEXT: s_or_b32 s0, s1, s0 ; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i8_idx3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX11-NEXT: s_and_b32 s1, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s2 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_lshl_b32 s0, s0, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 3 ret i8 %element @@ -1339,6 +1785,23 @@ ; GFX10-NEXT: s_lshl_b32 s1, s1, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: 
extractelement_sgpr_v8i8_idx4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s2, s1, 0x80008 +; GFX11-NEXT: s_and_b32 s0, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, 24 +; GFX11-NEXT: s_or_b32 s0, s0, s3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 4 ret i8 %element @@ -1378,6 +1841,24 @@ ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshr_b32 s0, s0, 8 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i8_idx5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s2, s1, 0x80008 +; GFX11-NEXT: s_and_b32 s0, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, 24 +; GFX11-NEXT: s_or_b32 s0, s0, s3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_lshr_b32 s0, s0, 8 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 5 ret i8 %element @@ -1417,6 +1898,24 @@ ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i8_idx6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s2, s1, 0x80008 
+; GFX11-NEXT: s_and_b32 s0, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, 24 +; GFX11-NEXT: s_or_b32 s0, s0, s3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 6 ret i8 %element @@ -1456,6 +1955,24 @@ ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v8i8_idx7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s2, s1, 0x80008 +; GFX11-NEXT: s_and_b32 s0, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, 24 +; GFX11-NEXT: s_or_b32 s0, s0, s3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: ; return to shader part epilog %vector = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %element = extractelement <8 x i8> %vector, i32 7 ret i8 %element @@ -1529,6 +2046,25 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i8_idx0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 
8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 0 ret i8 %element @@ -1606,6 +2142,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i8_idx1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 1 ret i8 %element @@ -1683,6 +2239,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX11-LABEL: extractelement_vgpr_v8i8_idx2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 2 ret i8 %element @@ -1760,6 +2336,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i8_idx3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 3 ret i8 %element @@ -1833,6 +2429,25 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i8_idx4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v1, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 4 ret i8 %element @@ -1910,6 +2525,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v1, v0, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i8_idx5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v1, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 5 ret i8 %element @@ -1987,6 +2622,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i8_idx6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v1, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 6 ret i8 %element @@ -2064,6 +2719,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v8i8_idx7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v1, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %element = extractelement <8 x i8> %vector, i32 7 ret i8 %element @@ -2181,6 +2856,63 @@ ; GFX10-NEXT: s_lshl_b32 s1, s1, 3 ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v16i8_sgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s10, s0, 0x80008 +; GFX11-NEXT: s_bfe_u32 s12, s1, 0x80008 +; GFX11-NEXT: s_lshr_b32 s6, s1, 24 +; GFX11-NEXT: s_and_b32 s9, s0, 0xff +; GFX11-NEXT: s_and_b32 s11, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-NEXT: s_lshl_b32 s12, s12, 8 +; GFX11-NEXT: s_lshr_b32 s5, s0, 24 +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_bfe_u32 s14, s2, 0x80008 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_lshl_b32 s6, s6, 24 +; GFX11-NEXT: s_or_b32 s1, s10, s1 +; GFX11-NEXT: s_lshr_b32 s7, s2, 24 +; GFX11-NEXT: s_and_b32 s13, s2, 0xff +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX11-NEXT: s_lshl_b32 
s5, s5, 24 +; GFX11-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-NEXT: s_or_b32 s0, s9, s0 +; GFX11-NEXT: s_or_b32 s1, s1, s6 +; GFX11-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX11-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_or_b32 s11, s13, s14 +; GFX11-NEXT: s_or_b32 s0, s0, s5 +; GFX11-NEXT: s_lshl_b32 s5, s7, 24 +; GFX11-NEXT: s_and_b32 s7, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX11-NEXT: s_or_b32 s2, s11, s2 +; GFX11-NEXT: s_or_b32 s6, s7, s6 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s2, s2, s5 +; GFX11-NEXT: s_or_b32 s3, s6, s3 +; GFX11-NEXT: s_lshl_b32 s5, s8, 24 +; GFX11-NEXT: s_lshr_b32 s6, s4, 2 +; GFX11-NEXT: s_or_b32 s3, s3, s5 +; GFX11-NEXT: s_cmp_eq_u32 s6, 1 +; GFX11-NEXT: s_cselect_b32 s0, s1, s0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 2 +; GFX11-NEXT: s_cselect_b32 s0, s2, s0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-NEXT: s_cselect_b32 s0, s3, s0 +; GFX11-NEXT: s_and_b32 s1, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s1, s1, 3 +; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %vector = load <16 x i8>, <16 x i8> addrspace(4)* %ptr %element = extractelement <16 x i8> %vector, i32 %idx ret i8 %element @@ -2387,6 +3119,61 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_vgpr_v16i8_sgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_lshr_b32 s0, s2, 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v12, v2, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-NEXT: v_bfe_u32 v8, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v12, 8, v12 +; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 8 +; GFX11-NEXT: v_bfe_u32 v10, v1, 8, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_and_or_b32 v2, 0xff, v2, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v8 +; GFX11-NEXT: v_or3_b32 v2, v2, v13, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v10 +; GFX11-NEXT: v_bfe_u32 v14, v3, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX11-NEXT: v_or3_b32 v0, v0, v9, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 8, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or3_b32 v1, v1, v11, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v3, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX11-NEXT: s_and_b32 s0, s2, 3 +; GFX11-NEXT: s_lshl_b32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v1, v3, v4, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; 
GFX11-NEXT: ; return to shader part epilog %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 %idx ret i8 %element @@ -2594,6 +3381,59 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_vgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 2, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v14, v5, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v5 +; GFX11-NEXT: v_bfe_u32 v15, v5, 16, 8 +; GFX11-NEXT: v_bfe_u32 v10, v3, 8, 8 +; GFX11-NEXT: v_bfe_u32 v12, v4, 8, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-NEXT: v_and_or_b32 v5, 0xff, v5, v14 +; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 8 +; GFX11-NEXT: v_bfe_u32 v13, v4, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX11-NEXT: v_bfe_u32 v16, v6, 8, 8 +; GFX11-NEXT: v_or3_b32 v5, v5, v15, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX11-NEXT: v_and_or_b32 v3, v3, 0xff, v10 +; GFX11-NEXT: v_and_or_b32 v4, v4, 0xff, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 8, v16 +; GFX11-NEXT: v_or3_b32 v1, v3, v11, v1 +; GFX11-NEXT: v_or3_b32 v3, v4, v13, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; 
GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; GFX11-NEXT: v_and_or_b32 v6, 0xff, v6, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX11-NEXT: v_or3_b32 v3, v6, v12, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_lshlrev_b32 v1, 3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 %idx ret i8 %element @@ -2718,6 +3558,67 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: extractelement_sgpr_v16i8_vgpr_idx: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s9, s0, 0x80008 +; GFX11-NEXT: s_bfe_u32 s11, s1, 0x80008 +; GFX11-NEXT: s_lshr_b32 s5, s1, 24 +; GFX11-NEXT: s_and_b32 s8, s0, 0xff +; GFX11-NEXT: s_and_b32 s10, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: s_lshl_b32 s5, s5, 24 +; GFX11-NEXT: s_or_b32 s1, s9, s1 +; GFX11-NEXT: s_lshr_b32 s4, s0, 24 +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_or_b32 s1, s1, 
s5 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_bfe_u32 s13, s2, 0x80008 +; GFX11-NEXT: v_mov_b32_e32 v2, s1 +; GFX11-NEXT: s_lshl_b32 s4, s4, 24 +; GFX11-NEXT: s_or_b32 s0, s8, s0 +; GFX11-NEXT: s_lshr_b32 s6, s2, 24 +; GFX11-NEXT: s_and_b32 s12, s2, 0xff +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX11-NEXT: s_lshl_b32 s13, s13, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s4 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_or_b32 s10, s12, s13 +; GFX11-NEXT: s_bfe_u32 s5, s3, 0x80008 +; GFX11-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; GFX11-NEXT: s_or_b32 s2, s10, s2 +; GFX11-NEXT: s_lshl_b32 s4, s6, 24 +; GFX11-NEXT: s_and_b32 s6, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX11-NEXT: s_or_b32 s2, s2, s4 +; GFX11-NEXT: s_lshr_b32 s7, s3, 24 +; GFX11-NEXT: s_or_b32 s3, s6, s5 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; GFX11-NEXT: s_or_b32 s0, s3, s1 +; GFX11-NEXT: s_lshl_b32 s1, s7, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s3, s0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %vector = load <16 x i8>, <16 x i8> addrspace(4)* %ptr %element = extractelement <16 x i8> %vector, i32 %idx ret i8 %element @@ -2791,6 +3692,25 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 0 ret i8 %element @@ -2868,6 +3788,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 1 ret i8 %element @@ -2945,6 +3885,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX10-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 2 ret i8 %element @@ -3022,6 +3982,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 3 ret i8 %element @@ -3095,6 +4075,25 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v1, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 4 ret i8 %element @@ -3172,6 +4171,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v1, v0, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v1, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 5 ret i8 %element @@ -3249,6 +4268,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v1, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 6 ret i8 %element @@ -3326,6 +4365,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx7: +; GFX11: 
; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v1, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 7 ret i8 %element @@ -3399,6 +4458,25 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v2, 8, 8 +; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, 
<16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 8 ret i8 %element @@ -3476,6 +4554,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v1, v0, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx9: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v2, 8, 8 +; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 9 ret i8 %element @@ -3553,6 +4651,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx10: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v2, 8, 8 +; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 10 ret i8 %element @@ -3630,6 +4748,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx11: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v2, 8, 8 +; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 11 ret i8 %element @@ -3703,6 +4841,25 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx12: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: 
global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v3, 8, 8 +; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 12 ret i8 %element @@ -3780,6 +4937,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v1, v0, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx13: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v3, 8, 8 +; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 13 ret i8 %element @@ -3857,6 +5034,26 @@ ; 
GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx14: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v3, 8, 8 +; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 14 ret i8 %element @@ -3934,6 +5131,26 @@ ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: extractelement_vgpr_v16i8_idx15: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v0, v3, 8, 8 +; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: 
v_and_or_b32 v0, 0xff, v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vector = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %element = extractelement <16 x i8> %vector, i32 15 ret i8 %element diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GPRIDX %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define float @dyn_extract_v8f32_const_s_v(i32 %sel) { ; GCN-LABEL: dyn_extract_v8f32_const_s_v: @@ -28,25 +29,25 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: dyn_extract_v8f32_const_s_v: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo -; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, 0x41000000, vcc_lo -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: dyn_extract_v8f32_const_s_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v1, 0x41000000, vcc_lo +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel ret float %ext @@ -87,20 +88,20 @@ ; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v8f32_const_s_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: s_mov_b32 m0, s2 -; GFX10-NEXT: s_mov_b32 s11, 0x41000000 -; GFX10-NEXT: s_mov_b32 s10, 0x40e00000 -; GFX10-NEXT: s_mov_b32 s9, 0x40c00000 -; GFX10-NEXT:
s_mov_b32 s8, 0x40a00000 -; GFX10-NEXT: s_mov_b32 s7, 4.0 -; GFX10-NEXT: s_mov_b32 s6, 0x40400000 -; GFX10-NEXT: s_mov_b32 s5, 2.0 -; GFX10-NEXT: s_movrels_b32 s0, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v8f32_const_s_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s4, 1.0 +; GFX10PLUS-NEXT: s_mov_b32 m0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s11, 0x41000000 +; GFX10PLUS-NEXT: s_mov_b32 s10, 0x40e00000 +; GFX10PLUS-NEXT: s_mov_b32 s9, 0x40c00000 +; GFX10PLUS-NEXT: s_mov_b32 s8, 0x40a00000 +; GFX10PLUS-NEXT: s_mov_b32 s7, 4.0 +; GFX10PLUS-NEXT: s_mov_b32 s6, 0x40400000 +; GFX10PLUS-NEXT: s_mov_b32 s5, 2.0 +; GFX10PLUS-NEXT: s_movrels_b32 s0, s4 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel ret float %ext @@ -137,32 +138,32 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v8, vcc ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v8f32_s_v: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7,
v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s7, vcc_lo -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v8f32_s_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, s1 +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v1, s7, vcc_lo +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <8 x float> %vec, i32 %sel ret float %ext @@ -188,25 +189,25 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: dyn_extract_v8f32_v_v: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 -; GFX10-NEXT: 
v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: dyn_extract_v8f32_v_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <8 x float> %vec, i32 %sel ret float %ext @@ -237,11 +238,11 @@ ; MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; MOVREL-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v8f32_v_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 m0, s2 -; GFX10-NEXT: v_movrels_b32_e32 v0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v8f32_v_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 m0, s2 +; GFX10PLUS-NEXT: v_movrels_b32_e32 v0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <8 x float> %vec, i32 %sel ret float %ext @@ -282,20 +283,20 @@ ; 
MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v8f32_s_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 m0, s10 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_movrels_b32 s0, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v8f32_s_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 m0, s10 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_movrels_b32 s0, s0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <8 x float> %vec, i32 %sel ret float %ext @@ -388,6 +389,42 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s18, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s19, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: dyn_extract_v8i64_const_s_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b64 s[2:3], 2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 +; GFX11-NEXT: s_mov_b64 s[0:1], 1 +; GFX11-NEXT: s_mov_b64 s[4:5], 3 +; GFX11-NEXT: s_mov_b64 s[6:7], 4 +; GFX11-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX11-NEXT: s_mov_b64 s[8:9], 5 +; GFX11-NEXT: s_mov_b64 s[10:11], 6 +; GFX11-NEXT: s_mov_b64 s[12:13], 7 +; 
GFX11-NEXT: s_mov_b64 s[14:15], 8 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s15, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <8 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8>, i32 %sel ret i64 %ext @@ -444,6 +481,23 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dyn_extract_v8i64_const_s_s: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b64 s[4:5], 1 +; GFX11-NEXT: s_mov_b32 m0, s2 +; GFX11-NEXT: s_mov_b64 s[18:19], 8 +; GFX11-NEXT: s_mov_b64 s[16:17], 7 +; GFX11-NEXT: s_mov_b64 s[14:15], 6 +; GFX11-NEXT: s_mov_b64 s[12:13], 5 +; GFX11-NEXT: s_mov_b64 s[10:11], 4 +; GFX11-NEXT: s_mov_b64 s[8:9], 3 +; GFX11-NEXT: s_mov_b64 s[6:7], 2 +; GFX11-NEXT: s_movrels_b64 s[0:1], s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %ext = extractelement <8 x i64> <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8>, i32 %sel store i64 %ext, i64 addrspace(1)* undef @@ -602,6 +656,50 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s15, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_endpgm
+; +; GFX11-LABEL: dyn_extract_v8i64_s_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s19, s5 +; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s19 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s12, s14 +; GFX11-NEXT: s_mov_b32 s13, s15 +; GFX11-NEXT: s_mov_b32 s14, s16 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX11-NEXT: s_mov_b32 s15, s17 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s15, vcc_lo +; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %ext = extractelement <8 x i64> %vec, i32 %sel store i64 %ext, i64 addrspace(1)* undef @@ -661,6 +759,26 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, 
v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: dyn_extract_v8i64_v_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <8 x i64> %vec, i32 %sel ret i64 %ext @@ -692,6 +810,15 @@ ; GFX10-NEXT: v_movrels_b32_e32 v17, v1 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[16:17], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dyn_extract_v8i64_v_s: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_lshl_b32 m0, s2, 1 +; GFX11-NEXT: v_movrels_b32_e32 v16, v0 +; GFX11-NEXT: v_movrels_b32_e32 v17, v1 +; GFX11-NEXT: global_store_b64 v[0:1], v[16:17], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %ext = extractelement <8 x i64> %vec, i32 %sel store i64 %ext, i64 addrspace(1)* undef @@ -773,6 +900,31 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dyn_extract_v8i64_s_s: +; GFX11: ; %bb.0: ; 
%entry +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 m0, s18 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s12, s14 +; GFX11-NEXT: s_mov_b32 s13, s15 +; GFX11-NEXT: s_mov_b32 s14, s16 +; GFX11-NEXT: s_mov_b32 s15, s17 +; GFX11-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %ext = extractelement <8 x i64> %vec, i32 %sel store i64 %ext, i64 addrspace(1)* undef @@ -815,20 +967,20 @@ ; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v8f32_s_s_offset3: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 m0, s10 -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_movrels_b32 s0, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v8f32_s_s_offset3: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 m0, s10 +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_movrels_b32 s0, s3 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s0 +; GFX10PLUS-NEXT: ; return 
to shader part epilog entry: %add = add i32 %sel, 3 %ext = extractelement <8 x float> %vec, i32 %add @@ -876,26 +1028,26 @@ ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; MOVREL-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: dyn_extract_v8f32_v_v_offset3: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_nc_u32_e32 v8, 3, v8 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: dyn_extract_v8f32_v_v_offset3: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, 
v0, v6, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] entry: %add = add i32 %sel, 3 %ext = extractelement <8 x float> %vec, i32 %add @@ -925,27 +1077,27 @@ ; GCN-NEXT: s_movrels_b64 s[0:1], s[2:3] ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v8f64_s_s_offset1: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 m0, s18 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: s_mov_b32 s15, s17 -; GFX10-NEXT: s_movrels_b64 s[0:1], s[2:3] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v8f64_s_s_offset1: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 m0, s18 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_mov_b32 s8, s10 +; GFX10PLUS-NEXT: s_mov_b32 s9, s11 +; GFX10PLUS-NEXT: s_mov_b32 s10, s12 +; GFX10PLUS-NEXT: s_mov_b32 s11, s13 +; GFX10PLUS-NEXT: s_mov_b32 s12, s14 +; GFX10PLUS-NEXT: s_mov_b32 s13, s15 +; GFX10PLUS-NEXT: s_mov_b32 s14, s16 +; GFX10PLUS-NEXT: s_mov_b32 s15, s17 +; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[2:3] +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 1 %ext = extractelement <8 x double> %vec, i32 %add @@ -975,27 +1127,27 @@ ; GCN-NEXT: 
s_movrels_b64 s[0:1], s[4:5] ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v8f64_s_s_offset2: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 m0, s18 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: s_mov_b32 s15, s17 -; GFX10-NEXT: s_movrels_b64 s[0:1], s[4:5] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v8f64_s_s_offset2: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 m0, s18 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_mov_b32 s8, s10 +; GFX10PLUS-NEXT: s_mov_b32 s9, s11 +; GFX10PLUS-NEXT: s_mov_b32 s10, s12 +; GFX10PLUS-NEXT: s_mov_b32 s11, s13 +; GFX10PLUS-NEXT: s_mov_b32 s12, s14 +; GFX10PLUS-NEXT: s_mov_b32 s13, s15 +; GFX10PLUS-NEXT: s_mov_b32 s14, s16 +; GFX10PLUS-NEXT: s_mov_b32 s15, s17 +; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[4:5] +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 2 %ext = extractelement <8 x double> %vec, i32 %add @@ -1025,27 +1177,27 @@ ; GCN-NEXT: s_movrels_b64 s[0:1], s[6:7] ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v8f64_s_s_offset3: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 
-; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 m0, s18 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: s_mov_b32 s15, s17 -; GFX10-NEXT: s_movrels_b64 s[0:1], s[6:7] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v8f64_s_s_offset3: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_mov_b32 m0, s18 +; GFX10PLUS-NEXT: s_mov_b32 s8, s10 +; GFX10PLUS-NEXT: s_mov_b32 s9, s11 +; GFX10PLUS-NEXT: s_mov_b32 s10, s12 +; GFX10PLUS-NEXT: s_mov_b32 s11, s13 +; GFX10PLUS-NEXT: s_mov_b32 s12, s14 +; GFX10PLUS-NEXT: s_mov_b32 s13, s15 +; GFX10PLUS-NEXT: s_mov_b32 s14, s16 +; GFX10PLUS-NEXT: s_mov_b32 s15, s17 +; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[6:7] +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 3 %ext = extractelement <8 x double> %vec, i32 %add @@ -1075,27 +1227,27 @@ ; GCN-NEXT: s_movrels_b64 s[0:1], s[8:9] ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v8f64_s_s_offset4: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 m0, s18 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: 
s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: s_mov_b32 s15, s17 -; GFX10-NEXT: s_movrels_b64 s[0:1], s[8:9] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v8f64_s_s_offset4: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_mov_b32 s8, s10 +; GFX10PLUS-NEXT: s_mov_b32 s9, s11 +; GFX10PLUS-NEXT: s_mov_b32 m0, s18 +; GFX10PLUS-NEXT: s_mov_b32 s10, s12 +; GFX10PLUS-NEXT: s_mov_b32 s11, s13 +; GFX10PLUS-NEXT: s_mov_b32 s12, s14 +; GFX10PLUS-NEXT: s_mov_b32 s13, s15 +; GFX10PLUS-NEXT: s_mov_b32 s14, s16 +; GFX10PLUS-NEXT: s_mov_b32 s15, s17 +; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[8:9] +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 4 %ext = extractelement <8 x double> %vec, i32 %add @@ -1125,27 +1277,27 @@ ; GCN-NEXT: s_movrels_b64 s[0:1], s[10:11] ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v8f64_s_s_offset5: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 m0, s18 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: s_mov_b32 s15, s17 -; GFX10-NEXT: s_movrels_b64 s[0:1], s[10:11] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: 
dyn_extract_v8f64_s_s_offset5: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_mov_b32 s8, s10 +; GFX10PLUS-NEXT: s_mov_b32 s9, s11 +; GFX10PLUS-NEXT: s_mov_b32 s10, s12 +; GFX10PLUS-NEXT: s_mov_b32 s11, s13 +; GFX10PLUS-NEXT: s_mov_b32 m0, s18 +; GFX10PLUS-NEXT: s_mov_b32 s12, s14 +; GFX10PLUS-NEXT: s_mov_b32 s13, s15 +; GFX10PLUS-NEXT: s_mov_b32 s14, s16 +; GFX10PLUS-NEXT: s_mov_b32 s15, s17 +; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[10:11] +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 5 %ext = extractelement <8 x double> %vec, i32 %add @@ -1175,27 +1327,27 @@ ; GCN-NEXT: s_movrels_b64 s[0:1], s[12:13] ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v8f64_s_s_offset6: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 m0, s18 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: s_mov_b32 s15, s17 -; GFX10-NEXT: s_movrels_b64 s[0:1], s[12:13] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v8f64_s_s_offset6: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 
s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_mov_b32 s8, s10 +; GFX10PLUS-NEXT: s_mov_b32 s9, s11 +; GFX10PLUS-NEXT: s_mov_b32 s10, s12 +; GFX10PLUS-NEXT: s_mov_b32 s11, s13 +; GFX10PLUS-NEXT: s_mov_b32 s12, s14 +; GFX10PLUS-NEXT: s_mov_b32 s13, s15 +; GFX10PLUS-NEXT: s_mov_b32 m0, s18 +; GFX10PLUS-NEXT: s_mov_b32 s14, s16 +; GFX10PLUS-NEXT: s_mov_b32 s15, s17 +; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[12:13] +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 6 %ext = extractelement <8 x double> %vec, i32 %add @@ -1248,27 +1400,27 @@ ; MOVREL-NEXT: s_movrels_b64 s[0:1], s[14:15] ; MOVREL-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v8f64_s_s_offset7: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: s_mov_b32 s15, s17 -; GFX10-NEXT: s_mov_b32 m0, s18 -; GFX10-NEXT: s_movrels_b64 s[0:1], s[14:15] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v8f64_s_s_offset7: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_mov_b32 s8, s10 +; GFX10PLUS-NEXT: s_mov_b32 s9, s11 +; GFX10PLUS-NEXT: s_mov_b32 s10, s12 +; GFX10PLUS-NEXT: s_mov_b32 s11, s13 +; GFX10PLUS-NEXT: s_mov_b32 s12, 
s14 +; GFX10PLUS-NEXT: s_mov_b32 s13, s15 +; GFX10PLUS-NEXT: s_mov_b32 s14, s16 +; GFX10PLUS-NEXT: s_mov_b32 s15, s17 +; GFX10PLUS-NEXT: s_mov_b32 m0, s18 +; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[14:15] +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 7 %ext = extractelement <8 x double> %vec, i32 %add @@ -1298,27 +1450,27 @@ ; GCN-NEXT: s_movrels_b64 s[0:1], s[0:1] ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v8f64_s_s_offsetm1: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_add_i32 m0, s18, -1 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: s_mov_b32 s15, s17 -; GFX10-NEXT: s_movrels_b64 s[0:1], s[0:1] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v8f64_s_s_offsetm1: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_add_i32 m0, s18, -1 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_mov_b32 s8, s10 +; GFX10PLUS-NEXT: s_mov_b32 s9, s11 +; GFX10PLUS-NEXT: s_mov_b32 s10, s12 +; GFX10PLUS-NEXT: s_mov_b32 s11, s13 +; GFX10PLUS-NEXT: s_mov_b32 s12, s14 +; GFX10PLUS-NEXT: s_mov_b32 s13, s15 +; GFX10PLUS-NEXT: s_mov_b32 s14, s16 +; GFX10PLUS-NEXT: s_mov_b32 s15, s17 +; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %add 
= add i32 %sel, -1 %ext = extractelement <8 x double> %vec, i32 %add @@ -1407,6 +1559,27 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: dyn_extract_v8f64_v_v_offset3: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_cndmask_b32 v0, v0, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_cndmask_b32 v0, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v7 :: v_dual_cndmask_b32 v0, v0, v6 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v9 :: v_dual_cndmask_b32 v0, v0, v8 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v11 :: v_dual_cndmask_b32 v0, v0, v10 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v13 :: v_dual_cndmask_b32 v0, v0, v12 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v15 :: v_dual_cndmask_b32 v0, v0, v14 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %add = add i32 %sel, 3 %ext = extractelement <8 x double> %vec, i32 %add @@ -1433,25 +1606,25 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: dyn_extract_v8p3_v_v: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, 
vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: dyn_extract_v8p3_v_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v8 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <8 x i8 addrspace(3)*> %vec, i32 %idx ret i8 addrspace(3)* %ext @@ -1510,6 +1683,22 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: ds_write_b32 v0, v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dyn_extract_v8p3_s_s: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 m0, s10 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_movrels_b32 s0, s0 +; 
GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: ds_store_b32 v0, v0 +; GFX11-NEXT: s_endpgm entry: %ext = extractelement <8 x i8 addrspace(3)*> %vec, i32 %idx store i8 addrspace(3)* %ext, i8 addrspace(3)* addrspace(3)* undef @@ -1569,6 +1758,26 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: dyn_extract_v8p1_v_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v14 :: v_dual_cndmask_b32 v1, v1, v15 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <8 x i8 addrspace(1)*> %vec, i32 %idx ret i8 addrspace(1)* %ext @@ -1649,6 +1858,31 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dyn_extract_v8p1_s_s: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 m0, s18 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s3, s5 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; 
GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s12, s14 +; GFX11-NEXT: s_mov_b32 s13, s15 +; GFX11-NEXT: s_mov_b32 s14, s16 +; GFX11-NEXT: s_mov_b32 s15, s17 +; GFX11-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %ext = extractelement <8 x i8 addrspace(1)*> %vec, i32 %idx store i8 addrspace(1)* %ext, i8 addrspace(1)* addrspace(1)* undef @@ -1669,11 +1903,11 @@ ; MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; MOVREL-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v16f32_v_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 m0, s2 -; GFX10-NEXT: v_movrels_b32_e32 v0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v16f32_v_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 m0, s2 +; GFX10PLUS-NEXT: v_movrels_b32_e32 v0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <16 x float> %vec, i32 %sel ret float %ext @@ -1693,11 +1927,11 @@ ; MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; MOVREL-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v32f32_v_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 m0, s2 -; GFX10-NEXT: v_movrels_b32_e32 v0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v32f32_v_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 m0, s2 +; GFX10PLUS-NEXT: v_movrels_b32_e32 v0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <32 x float> %vec, i32 %sel ret float %ext @@ -1724,14 +1958,14 @@ ; MOVREL-NEXT: v_readfirstlane_b32 s1, v0 ; MOVREL-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: 
dyn_extract_v16f64_v_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_lshl_b32 m0, s2, 1 -; GFX10-NEXT: v_movrels_b32_e32 v32, v0 -; GFX10-NEXT: v_movrels_b32_e32 v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v32 -; GFX10-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v16f64_v_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_lshl_b32 m0, s2, 1 +; GFX10PLUS-NEXT: v_movrels_b32_e32 v32, v0 +; GFX10PLUS-NEXT: v_movrels_b32_e32 v0, v1 +; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v32 +; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <16 x double> %vec, i32 %sel ret double %ext @@ -1761,28 +1995,28 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v16f32_s_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: s_mov_b32 m0, s2 -; GFX10-NEXT: s_mov_b32 s19, 0x41800000 -; GFX10-NEXT: s_mov_b32 s18, 0x41700000 -; GFX10-NEXT: s_mov_b32 s17, 0x41600000 -; GFX10-NEXT: s_mov_b32 s16, 0x41500000 -; GFX10-NEXT: s_mov_b32 s15, 0x41400000 -; GFX10-NEXT: s_mov_b32 s14, 0x41300000 -; GFX10-NEXT: s_mov_b32 s13, 0x41200000 -; GFX10-NEXT: s_mov_b32 s12, 0x41100000 -; GFX10-NEXT: s_mov_b32 s11, 0x41000000 -; GFX10-NEXT: s_mov_b32 s10, 0x40e00000 -; GFX10-NEXT: s_mov_b32 s9, 0x40c00000 -; GFX10-NEXT: s_mov_b32 s8, 0x40a00000 -; GFX10-NEXT: s_mov_b32 s7, 4.0 -; GFX10-NEXT: s_mov_b32 s6, 0x40400000 -; GFX10-NEXT: s_mov_b32 s5, 2.0 -; GFX10-NEXT: s_movrels_b32 s0, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v16f32_s_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s4, 1.0 +; GFX10PLUS-NEXT: s_mov_b32 m0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s19, 0x41800000 +; GFX10PLUS-NEXT: s_mov_b32 s18, 0x41700000 +; GFX10PLUS-NEXT: s_mov_b32 s17, 0x41600000 +; GFX10PLUS-NEXT: s_mov_b32 s16, 0x41500000 
+; GFX10PLUS-NEXT: s_mov_b32 s15, 0x41400000 +; GFX10PLUS-NEXT: s_mov_b32 s14, 0x41300000 +; GFX10PLUS-NEXT: s_mov_b32 s13, 0x41200000 +; GFX10PLUS-NEXT: s_mov_b32 s12, 0x41100000 +; GFX10PLUS-NEXT: s_mov_b32 s11, 0x41000000 +; GFX10PLUS-NEXT: s_mov_b32 s10, 0x40e00000 +; GFX10PLUS-NEXT: s_mov_b32 s9, 0x40c00000 +; GFX10PLUS-NEXT: s_mov_b32 s8, 0x40a00000 +; GFX10PLUS-NEXT: s_mov_b32 s7, 4.0 +; GFX10PLUS-NEXT: s_mov_b32 s6, 0x40400000 +; GFX10PLUS-NEXT: s_mov_b32 s5, 2.0 +; GFX10PLUS-NEXT: s_movrels_b32 s0, s4 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <16 x float> , i32 %sel ret float %ext @@ -1828,44 +2062,44 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v32f32_s_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s36, 1.0 -; GFX10-NEXT: s_mov_b32 m0, s2 -; GFX10-NEXT: s_mov_b32 s67, 0x42000000 -; GFX10-NEXT: s_mov_b32 s66, 0x41f80000 -; GFX10-NEXT: s_mov_b32 s65, 0x41f00000 -; GFX10-NEXT: s_mov_b32 s64, 0x41e80000 -; GFX10-NEXT: s_mov_b32 s63, 0x41e00000 -; GFX10-NEXT: s_mov_b32 s62, 0x41d80000 -; GFX10-NEXT: s_mov_b32 s61, 0x41d00000 -; GFX10-NEXT: s_mov_b32 s60, 0x41c80000 -; GFX10-NEXT: s_mov_b32 s59, 0x41c00000 -; GFX10-NEXT: s_mov_b32 s58, 0x41b80000 -; GFX10-NEXT: s_mov_b32 s57, 0x41b00000 -; GFX10-NEXT: s_mov_b32 s56, 0x41a80000 -; GFX10-NEXT: s_mov_b32 s55, 0x41a00000 -; GFX10-NEXT: s_mov_b32 s54, 0x41980000 -; GFX10-NEXT: s_mov_b32 s53, 0x41900000 -; GFX10-NEXT: s_mov_b32 s52, 0x41880000 -; GFX10-NEXT: s_mov_b32 s51, 0x41800000 -; GFX10-NEXT: s_mov_b32 s50, 0x41700000 -; GFX10-NEXT: s_mov_b32 s49, 0x41600000 -; GFX10-NEXT: s_mov_b32 s48, 0x41500000 -; GFX10-NEXT: s_mov_b32 s47, 0x41400000 -; GFX10-NEXT: s_mov_b32 s46, 0x41300000 -; GFX10-NEXT: s_mov_b32 s45, 0x41200000 -; GFX10-NEXT: s_mov_b32 s44, 0x41100000 -; GFX10-NEXT: s_mov_b32 s43, 0x41000000 -; GFX10-NEXT: s_mov_b32 s42, 0x40e00000 -; GFX10-NEXT: 
s_mov_b32 s41, 0x40c00000 -; GFX10-NEXT: s_mov_b32 s40, 0x40a00000 -; GFX10-NEXT: s_mov_b32 s39, 4.0 -; GFX10-NEXT: s_mov_b32 s38, 0x40400000 -; GFX10-NEXT: s_mov_b32 s37, 2.0 -; GFX10-NEXT: s_movrels_b32 s0, s36 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v32f32_s_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s36, 1.0 +; GFX10PLUS-NEXT: s_mov_b32 m0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s67, 0x42000000 +; GFX10PLUS-NEXT: s_mov_b32 s66, 0x41f80000 +; GFX10PLUS-NEXT: s_mov_b32 s65, 0x41f00000 +; GFX10PLUS-NEXT: s_mov_b32 s64, 0x41e80000 +; GFX10PLUS-NEXT: s_mov_b32 s63, 0x41e00000 +; GFX10PLUS-NEXT: s_mov_b32 s62, 0x41d80000 +; GFX10PLUS-NEXT: s_mov_b32 s61, 0x41d00000 +; GFX10PLUS-NEXT: s_mov_b32 s60, 0x41c80000 +; GFX10PLUS-NEXT: s_mov_b32 s59, 0x41c00000 +; GFX10PLUS-NEXT: s_mov_b32 s58, 0x41b80000 +; GFX10PLUS-NEXT: s_mov_b32 s57, 0x41b00000 +; GFX10PLUS-NEXT: s_mov_b32 s56, 0x41a80000 +; GFX10PLUS-NEXT: s_mov_b32 s55, 0x41a00000 +; GFX10PLUS-NEXT: s_mov_b32 s54, 0x41980000 +; GFX10PLUS-NEXT: s_mov_b32 s53, 0x41900000 +; GFX10PLUS-NEXT: s_mov_b32 s52, 0x41880000 +; GFX10PLUS-NEXT: s_mov_b32 s51, 0x41800000 +; GFX10PLUS-NEXT: s_mov_b32 s50, 0x41700000 +; GFX10PLUS-NEXT: s_mov_b32 s49, 0x41600000 +; GFX10PLUS-NEXT: s_mov_b32 s48, 0x41500000 +; GFX10PLUS-NEXT: s_mov_b32 s47, 0x41400000 +; GFX10PLUS-NEXT: s_mov_b32 s46, 0x41300000 +; GFX10PLUS-NEXT: s_mov_b32 s45, 0x41200000 +; GFX10PLUS-NEXT: s_mov_b32 s44, 0x41100000 +; GFX10PLUS-NEXT: s_mov_b32 s43, 0x41000000 +; GFX10PLUS-NEXT: s_mov_b32 s42, 0x40e00000 +; GFX10PLUS-NEXT: s_mov_b32 s41, 0x40c00000 +; GFX10PLUS-NEXT: s_mov_b32 s40, 0x40a00000 +; GFX10PLUS-NEXT: s_mov_b32 s39, 4.0 +; GFX10PLUS-NEXT: s_mov_b32 s38, 0x40400000 +; GFX10PLUS-NEXT: s_mov_b32 s37, 2.0 +; GFX10PLUS-NEXT: s_movrels_b32 s0, s36 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <32 x float> 
, i32 %sel ret float %ext @@ -1907,40 +2141,40 @@ ; GCN-NEXT: s_movrels_b64 s[0:1], s[36:37] ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v16f64_s_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s66, 0 -; GFX10-NEXT: s_mov_b64 s[36:37], 1.0 -; GFX10-NEXT: s_mov_b32 m0, s2 -; GFX10-NEXT: s_mov_b32 s67, 0x40300000 -; GFX10-NEXT: s_mov_b32 s65, 0x402e0000 -; GFX10-NEXT: s_mov_b32 s64, s66 -; GFX10-NEXT: s_mov_b32 s63, 0x402c0000 -; GFX10-NEXT: s_mov_b32 s62, s66 -; GFX10-NEXT: s_mov_b32 s61, 0x402a0000 -; GFX10-NEXT: s_mov_b32 s60, s66 -; GFX10-NEXT: s_mov_b32 s59, 0x40280000 -; GFX10-NEXT: s_mov_b32 s58, s66 -; GFX10-NEXT: s_mov_b32 s57, 0x40260000 -; GFX10-NEXT: s_mov_b32 s56, s66 -; GFX10-NEXT: s_mov_b32 s55, 0x40240000 -; GFX10-NEXT: s_mov_b32 s54, s66 -; GFX10-NEXT: s_mov_b32 s53, 0x40220000 -; GFX10-NEXT: s_mov_b32 s52, s66 -; GFX10-NEXT: s_mov_b32 s51, 0x40200000 -; GFX10-NEXT: s_mov_b32 s50, s66 -; GFX10-NEXT: s_mov_b32 s49, 0x401c0000 -; GFX10-NEXT: s_mov_b32 s48, s66 -; GFX10-NEXT: s_mov_b32 s47, 0x40180000 -; GFX10-NEXT: s_mov_b32 s46, s66 -; GFX10-NEXT: s_mov_b32 s45, 0x40140000 -; GFX10-NEXT: s_mov_b32 s44, s66 -; GFX10-NEXT: s_mov_b64 s[42:43], 4.0 -; GFX10-NEXT: s_mov_b32 s41, 0x40080000 -; GFX10-NEXT: s_mov_b32 s40, s66 -; GFX10-NEXT: s_mov_b64 s[38:39], 2.0 -; GFX10-NEXT: s_movrels_b64 s[0:1], s[36:37] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v16f64_s_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s66, 0 +; GFX10PLUS-NEXT: s_mov_b64 s[36:37], 1.0 +; GFX10PLUS-NEXT: s_mov_b32 m0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s67, 0x40300000 +; GFX10PLUS-NEXT: s_mov_b32 s65, 0x402e0000 +; GFX10PLUS-NEXT: s_mov_b32 s64, s66 +; GFX10PLUS-NEXT: s_mov_b32 s63, 0x402c0000 +; GFX10PLUS-NEXT: s_mov_b32 s62, s66 +; GFX10PLUS-NEXT: s_mov_b32 s61, 0x402a0000 +; GFX10PLUS-NEXT: s_mov_b32 s60, s66 +; GFX10PLUS-NEXT: s_mov_b32 s59, 0x40280000 +; GFX10PLUS-NEXT: s_mov_b32 s58, s66 +; 
GFX10PLUS-NEXT: s_mov_b32 s57, 0x40260000 +; GFX10PLUS-NEXT: s_mov_b32 s56, s66 +; GFX10PLUS-NEXT: s_mov_b32 s55, 0x40240000 +; GFX10PLUS-NEXT: s_mov_b32 s54, s66 +; GFX10PLUS-NEXT: s_mov_b32 s53, 0x40220000 +; GFX10PLUS-NEXT: s_mov_b32 s52, s66 +; GFX10PLUS-NEXT: s_mov_b32 s51, 0x40200000 +; GFX10PLUS-NEXT: s_mov_b32 s50, s66 +; GFX10PLUS-NEXT: s_mov_b32 s49, 0x401c0000 +; GFX10PLUS-NEXT: s_mov_b32 s48, s66 +; GFX10PLUS-NEXT: s_mov_b32 s47, 0x40180000 +; GFX10PLUS-NEXT: s_mov_b32 s46, s66 +; GFX10PLUS-NEXT: s_mov_b32 s45, 0x40140000 +; GFX10PLUS-NEXT: s_mov_b32 s44, s66 +; GFX10PLUS-NEXT: s_mov_b64 s[42:43], 4.0 +; GFX10PLUS-NEXT: s_mov_b32 s41, 0x40080000 +; GFX10PLUS-NEXT: s_mov_b32 s40, s66 +; GFX10PLUS-NEXT: s_mov_b64 s[38:39], 2.0 +; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[36:37] +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <16 x double> , i32 %sel ret double %ext @@ -1969,26 +2203,26 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v6f32_s_v: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s5, vcc_lo -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v6f32_s_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, s1 +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v1, s5, vcc_lo +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x float> %vec, i32 %sel ret float %ext @@ -2010,21 +2244,21 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: dyn_extract_v6f32_v_v: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: dyn_extract_v6f32_v_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v6 +; GFX10PLUS-NEXT: 
v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v6 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v6 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v6 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <6 x float> %vec, i32 %sel ret float %ext @@ -2045,19 +2279,19 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v6f32_v_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v6f32_v_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x float> %vec, i32 %sel ret float %ext @@ -2079,20 +2313,20 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader 
part epilog ; -; GFX10-LABEL: dyn_extract_v6f32_s_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_cmp_eq_u32 s8, 1 -; GFX10-NEXT: s_cselect_b32 s0, s3, s2 -; GFX10-NEXT: s_cmp_eq_u32 s8, 2 -; GFX10-NEXT: s_cselect_b32 s0, s4, s0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 3 -; GFX10-NEXT: s_cselect_b32 s0, s5, s0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 4 -; GFX10-NEXT: s_cselect_b32 s0, s6, s0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 5 -; GFX10-NEXT: s_cselect_b32 s0, s7, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v6f32_s_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_cmp_eq_u32 s8, 1 +; GFX10PLUS-NEXT: s_cselect_b32 s0, s3, s2 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s8, 2 +; GFX10PLUS-NEXT: s_cselect_b32 s0, s4, s0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s8, 3 +; GFX10PLUS-NEXT: s_cselect_b32 s0, s5, s0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s8, 4 +; GFX10PLUS-NEXT: s_cselect_b32 s0, s6, s0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s8, 5 +; GFX10PLUS-NEXT: s_cselect_b32 s0, s7, s0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x float> %vec, i32 %sel ret float %ext @@ -2125,29 +2359,29 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v7, vcc ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v7f32_s_v: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 
v1, s4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s6, vcc_lo -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v7f32_s_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, s1 +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v1, s6, vcc_lo +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x float> %vec, i32 %sel ret float %ext @@ -2171,23 +2405,23 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: dyn_extract_v7f32_v_v: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v7 -; 
GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: dyn_extract_v7f32_v_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v7 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v7 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v7 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v7 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v7 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <7 x float> %vec, i32 %sel ret float %ext @@ -2210,21 +2444,21 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v7f32_v_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 6 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX10-NEXT: ; 
return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v7f32_v_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 6 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x float> %vec, i32 %sel ret float %ext @@ -2248,22 +2482,22 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v7f32_s_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_cmp_eq_u32 s9, 1 -; GFX10-NEXT: s_cselect_b32 s0, s3, s2 -; GFX10-NEXT: s_cmp_eq_u32 s9, 2 -; GFX10-NEXT: s_cselect_b32 s0, s4, s0 -; GFX10-NEXT: s_cmp_eq_u32 s9, 3 -; GFX10-NEXT: s_cselect_b32 s0, s5, s0 -; GFX10-NEXT: s_cmp_eq_u32 s9, 4 -; GFX10-NEXT: s_cselect_b32 s0, s6, s0 -; GFX10-NEXT: s_cmp_eq_u32 s9, 5 -; GFX10-NEXT: s_cselect_b32 s0, s7, s0 -; GFX10-NEXT: s_cmp_eq_u32 s9, 6 -; GFX10-NEXT: s_cselect_b32 s0, s8, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v7f32_s_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_cmp_eq_u32 s9, 1 +; GFX10PLUS-NEXT: s_cselect_b32 s0, s3, s2 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s9, 2 +; GFX10PLUS-NEXT: s_cselect_b32 s0, s4, s0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s9, 3 +; GFX10PLUS-NEXT: s_cselect_b32 s0, s5, s0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s9, 4 +; GFX10PLUS-NEXT: s_cselect_b32 s0, s6, s0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s9, 5 +; 
GFX10PLUS-NEXT: s_cselect_b32 s0, s7, s0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s9, 6 +; GFX10PLUS-NEXT: s_cselect_b32 s0, s8, s0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x float> %vec, i32 %sel ret float %ext @@ -2345,6 +2579,40 @@ ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_extract_v6f64_s_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s15, s5 +; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s15 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX11-NEXT: s_mov_b32 s7, s9 +; GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s11, vcc_lo +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x double> %vec, i32 %sel ret double %ext @@ -2391,6 +2659,22 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo ; 
GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: dyn_extract_v6f64_v_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v12 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v12 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <6 x double> %vec, i32 %sel ret double %ext @@ -2417,14 +2701,14 @@ ; MOVREL-NEXT: v_readfirstlane_b32 s1, v0 ; MOVREL-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v6f64_v_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_lshl_b32 m0, s2, 1 -; GFX10-NEXT: v_movrels_b32_e32 v12, v0 -; GFX10-NEXT: v_movrels_b32_e32 v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v12 -; GFX10-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v6f64_v_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_lshl_b32 m0, s2, 1 +; GFX10PLUS-NEXT: v_movrels_b32_e32 v12, v0 +; GFX10PLUS-NEXT: v_movrels_b32_e32 v0, v1 +; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v12 +; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x double> %vec, i32 %sel ret double %ext @@ -2449,23 +2733,23 @@ ; GCN-NEXT: s_movrels_b64 s[0:1], s[0:1] ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v6f64_s_s: -; GFX10: ; 
%bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 m0, s14 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_movrels_b64 s[0:1], s[0:1] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v6f64_s_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 m0, s14 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_mov_b32 s8, s10 +; GFX10PLUS-NEXT: s_mov_b32 s9, s11 +; GFX10PLUS-NEXT: s_mov_b32 s10, s12 +; GFX10PLUS-NEXT: s_mov_b32 s11, s13 +; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <6 x double> %vec, i32 %sel ret double %ext @@ -2559,6 +2843,45 @@ ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: dyn_extract_v7f64_s_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, s2 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_mov_b32 s19, s5 +; GFX11-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s19 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: s_mov_b32 s5, s7 +; GFX11-NEXT: s_mov_b32 s6, s8 +; GFX11-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, s1, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX11-NEXT: s_mov_b32 s7, s9 +; 
GFX11-NEXT: s_mov_b32 s8, s10 +; GFX11-NEXT: s_mov_b32 s9, s11 +; GFX11-NEXT: s_mov_b32 s10, s12 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX11-NEXT: s_mov_b32 s11, s13 +; GFX11-NEXT: s_mov_b32 s12, s14 +; GFX11-NEXT: s_mov_b32 s13, s15 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s9, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s12, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s13, vcc_lo +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x double> %vec, i32 %sel ret double %ext @@ -2611,6 +2934,24 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: dyn_extract_v7f64_v_v: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_cndmask_b32 v1, v1, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v14 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v14 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v14 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v14 +; GFX11-NEXT: 
v_dual_cndmask_b32 v0, v0, v10 :: v_dual_cndmask_b32 v1, v1, v11 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v14 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v12 :: v_dual_cndmask_b32 v1, v1, v13 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <7 x double> %vec, i32 %sel ret double %ext @@ -2637,14 +2978,14 @@ ; MOVREL-NEXT: v_readfirstlane_b32 s1, v0 ; MOVREL-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v7f64_v_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_lshl_b32 m0, s2, 1 -; GFX10-NEXT: v_movrels_b32_e32 v14, v0 -; GFX10-NEXT: v_movrels_b32_e32 v0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s0, v14 -; GFX10-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v7f64_v_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_lshl_b32 m0, s2, 1 +; GFX10PLUS-NEXT: v_movrels_b32_e32 v14, v0 +; GFX10PLUS-NEXT: v_movrels_b32_e32 v0, v1 +; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v14 +; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x double> %vec, i32 %sel ret double %ext @@ -2671,25 +3012,25 @@ ; GCN-NEXT: s_movrels_b64 s[0:1], s[0:1] ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v7f64_s_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 m0, s16 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_movrels_b64 s[0:1], s[0:1] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v7f64_s_s: +; GFX10PLUS: ; %bb.0: ; %entry +; 
GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 m0, s16 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_mov_b32 s8, s10 +; GFX10PLUS-NEXT: s_mov_b32 s9, s11 +; GFX10PLUS-NEXT: s_mov_b32 s10, s12 +; GFX10PLUS-NEXT: s_mov_b32 s11, s13 +; GFX10PLUS-NEXT: s_mov_b32 s12, s14 +; GFX10PLUS-NEXT: s_mov_b32 s13, s15 +; GFX10PLUS-NEXT: s_movrels_b64 s[0:1], s[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <7 x double> %vec, i32 %sel ret double %ext @@ -2970,6 +3311,98 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dyn_extract_v5f64_s_s: +; GFX11: .amd_kernel_code_t +; GFX11-NEXT: amd_code_version_major = 1 +; GFX11-NEXT: amd_code_version_minor = 2 +; GFX11-NEXT: amd_machine_kind = 1 +; GFX11-NEXT: amd_machine_version_major = 11 +; GFX11-NEXT: amd_machine_version_minor = 0 +; GFX11-NEXT: amd_machine_version_stepping = 0 +; GFX11-NEXT: kernel_code_entry_byte_offset = 256 +; GFX11-NEXT: kernel_code_prefetch_byte_size = 0 +; GFX11-NEXT: granulated_workitem_vgpr_count = 0 +; GFX11-NEXT: granulated_wavefront_sgpr_count = 1 +; GFX11-NEXT: priority = 0 +; GFX11-NEXT: float_mode = 240 +; GFX11-NEXT: priv = 0 +; GFX11-NEXT: enable_dx10_clamp = 1 +; GFX11-NEXT: debug_mode = 0 +; GFX11-NEXT: enable_ieee_mode = 1 +; GFX11-NEXT: enable_wgp_mode = 1 +; GFX11-NEXT: enable_mem_ordered = 1 +; GFX11-NEXT: enable_fwd_progress = 0 +; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GFX11-NEXT: user_sgpr_count = 15 +; GFX11-NEXT: enable_trap_handler = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1 +; GFX11-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_z = 0 +; GFX11-NEXT: 
enable_sgpr_workgroup_info = 0 +; GFX11-NEXT: enable_vgpr_workitem_id = 0 +; GFX11-NEXT: enable_exception_msb = 0 +; GFX11-NEXT: granulated_lds_size = 0 +; GFX11-NEXT: enable_exception = 0 +; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 +; GFX11-NEXT: enable_sgpr_dispatch_ptr = 0 +; GFX11-NEXT: enable_sgpr_queue_ptr = 0 +; GFX11-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GFX11-NEXT: enable_sgpr_dispatch_id = 0 +; GFX11-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX11-NEXT: enable_sgpr_private_segment_size = 0 +; GFX11-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GFX11-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GFX11-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GFX11-NEXT: enable_wavefront_size32 = 1 +; GFX11-NEXT: enable_ordered_append_gds = 0 +; GFX11-NEXT: private_element_size = 1 +; GFX11-NEXT: is_ptr64 = 1 +; GFX11-NEXT: is_dynamic_callstack = 0 +; GFX11-NEXT: is_debug_enabled = 0 +; GFX11-NEXT: is_xnack_enabled = 0 +; GFX11-NEXT: workitem_private_segment_byte_size = 0 +; GFX11-NEXT: workgroup_group_segment_byte_size = 0 +; GFX11-NEXT: gds_segment_byte_size = 0 +; GFX11-NEXT: kernarg_segment_byte_size = 12 +; GFX11-NEXT: workgroup_fbarrier_count = 0 +; GFX11-NEXT: wavefront_sgpr_count = 9 +; GFX11-NEXT: workitem_vgpr_count = 3 +; GFX11-NEXT: reserved_vgpr_first = 0 +; GFX11-NEXT: reserved_vgpr_count = 0 +; GFX11-NEXT: reserved_sgpr_first = 0 +; GFX11-NEXT: reserved_sgpr_count = 0 +; GFX11-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GFX11-NEXT: debug_private_segment_buffer_sgpr = 0 +; GFX11-NEXT: kernarg_segment_alignment = 4 +; GFX11-NEXT: group_segment_alignment = 4 +; GFX11-NEXT: private_segment_alignment = 4 +; GFX11-NEXT: wavefront_size = 5 +; GFX11-NEXT: call_convention = -1 +; GFX11-NEXT: runtime_loader_kernel_symbol = 0 +; GFX11-NEXT: .end_amd_kernel_code_t +; GFX11-NEXT: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s8, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: 
s_mov_b32 s2, 0 +; GFX11-NEXT: s_mov_b32 s3, 0x40140000 +; GFX11-NEXT: s_mov_b32 s5, 0x40080000 +; GFX11-NEXT: s_mov_b32 s4, s2 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s8, 1 +; GFX11-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0 +; GFX11-NEXT: s_cmp_eq_u32 s8, 2 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX11-NEXT: s_cmp_eq_u32 s8, 3 +; GFX11-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5] +; GFX11-NEXT: s_cmp_eq_u32 s8, 4 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %ext = extractelement <5 x double> , i32 %sel store double %ext, double addrspace(1)* %out @@ -3022,39 +3455,39 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v12, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: dyn_extract_v15f32_const_s_v: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41000000, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41100000, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0 -; GFX10-NEXT: 
v_cndmask_b32_e64 v1, v1, 0x41200000, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41300000, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41400000, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41500000, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41600000, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, 0x41700000, vcc_lo -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: dyn_extract_v15f32_const_s_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41000000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41100000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41200000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41300000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0 +; GFX10PLUS-NEXT: 
v_cndmask_b32_e64 v1, v1, 0x41400000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41500000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x41600000, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v1, 0x41700000, vcc_lo +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <15 x float> , i32 %sel ret float %ext @@ -3083,27 +3516,27 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v15f32_const_s_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s4, 1.0 -; GFX10-NEXT: s_mov_b32 m0, s2 -; GFX10-NEXT: s_mov_b32 s18, 0x41700000 -; GFX10-NEXT: s_mov_b32 s17, 0x41600000 -; GFX10-NEXT: s_mov_b32 s16, 0x41500000 -; GFX10-NEXT: s_mov_b32 s15, 0x41400000 -; GFX10-NEXT: s_mov_b32 s14, 0x41300000 -; GFX10-NEXT: s_mov_b32 s13, 0x41200000 -; GFX10-NEXT: s_mov_b32 s12, 0x41100000 -; GFX10-NEXT: s_mov_b32 s11, 0x41000000 -; GFX10-NEXT: s_mov_b32 s10, 0x40e00000 -; GFX10-NEXT: s_mov_b32 s9, 0x40c00000 -; GFX10-NEXT: s_mov_b32 s8, 0x40a00000 -; GFX10-NEXT: s_mov_b32 s7, 4.0 -; GFX10-NEXT: s_mov_b32 s6, 0x40400000 -; GFX10-NEXT: s_mov_b32 s5, 2.0 -; GFX10-NEXT: s_movrels_b32 s0, s4 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v15f32_const_s_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s4, 1.0 +; GFX10PLUS-NEXT: s_mov_b32 m0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s18, 0x41700000 +; GFX10PLUS-NEXT: s_mov_b32 s17, 0x41600000 +; GFX10PLUS-NEXT: s_mov_b32 s16, 0x41500000 +; GFX10PLUS-NEXT: s_mov_b32 s15, 0x41400000 +; GFX10PLUS-NEXT: s_mov_b32 s14, 0x41300000 +; GFX10PLUS-NEXT: s_mov_b32 s13, 0x41200000 +; GFX10PLUS-NEXT: s_mov_b32 s12, 0x41100000 +; GFX10PLUS-NEXT: s_mov_b32 s11, 0x41000000 +; GFX10PLUS-NEXT: s_mov_b32 s10, 0x40e00000 +; GFX10PLUS-NEXT: 
s_mov_b32 s9, 0x40c00000 +; GFX10PLUS-NEXT: s_mov_b32 s8, 0x40a00000 +; GFX10PLUS-NEXT: s_mov_b32 s7, 4.0 +; GFX10PLUS-NEXT: s_mov_b32 s6, 0x40400000 +; GFX10PLUS-NEXT: s_mov_b32 s5, 2.0 +; GFX10PLUS-NEXT: s_movrels_b32 s0, s4 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <15 x float> , i32 %sel ret float %ext @@ -3168,53 +3601,53 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v15, vcc ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v15f32_s_v: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s9, vcc_lo -; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 10, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v15f32_s_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, s1 +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_mov_b32 s8, s10 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; GFX10PLUS-NEXT: s_mov_b32 s9, s11 +; GFX10PLUS-NEXT: s_mov_b32 s10, s12 +; GFX10PLUS-NEXT: s_mov_b32 s11, s13 +; GFX10PLUS-NEXT: s_mov_b32 s12, s14 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 +; GFX10PLUS-NEXT: s_mov_b32 s13, s15 +; GFX10PLUS-NEXT: s_mov_b32 s14, s16 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s4, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s5, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s6, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s7, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v0 +; 
GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s8, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s9, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s10, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s11, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s12, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s13, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0 +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <15 x float> %vec, i32 %sel ret float %ext @@ -3254,39 +3687,39 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: dyn_extract_v15f32_v_v: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo -; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 10, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: dyn_extract_v15f32_v_v: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo +; 
GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <15 x float> %vec, i32 %sel ret float %ext @@ -3306,11 +3739,11 @@ ; MOVREL-NEXT: v_movrels_b32_e32 v0, v0 ; MOVREL-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v15f32_v_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 m0, s2 -; GFX10-NEXT: v_movrels_b32_e32 v0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v15f32_v_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 m0, s2 +; GFX10PLUS-NEXT: v_movrels_b32_e32 v0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <15 x float> %vec, i32 %sel ret float %ext @@ -3339,27 +3772,27 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v15f32_s_s: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 m0, s17 -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: s_movrels_b32 s0, s0 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v15f32_s_s: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 m0, s17 +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: 
s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_mov_b32 s8, s10 +; GFX10PLUS-NEXT: s_mov_b32 s9, s11 +; GFX10PLUS-NEXT: s_mov_b32 s10, s12 +; GFX10PLUS-NEXT: s_mov_b32 s11, s13 +; GFX10PLUS-NEXT: s_mov_b32 s12, s14 +; GFX10PLUS-NEXT: s_mov_b32 s13, s15 +; GFX10PLUS-NEXT: s_mov_b32 s14, s16 +; GFX10PLUS-NEXT: s_movrels_b32 s0, s0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %ext = extractelement <15 x float> %vec, i32 %sel ret float %ext @@ -3388,27 +3821,27 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: dyn_extract_v15f32_s_s_offset3: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_mov_b32 s1, s3 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: s_mov_b32 m0, s17 -; GFX10-NEXT: s_mov_b32 s0, s2 -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s4, s6 -; GFX10-NEXT: s_mov_b32 s5, s7 -; GFX10-NEXT: s_mov_b32 s6, s8 -; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: s_mov_b32 s8, s10 -; GFX10-NEXT: s_mov_b32 s9, s11 -; GFX10-NEXT: s_mov_b32 s10, s12 -; GFX10-NEXT: s_mov_b32 s11, s13 -; GFX10-NEXT: s_mov_b32 s12, s14 -; GFX10-NEXT: s_mov_b32 s13, s15 -; GFX10-NEXT: s_mov_b32 s14, s16 -; GFX10-NEXT: s_movrels_b32 s0, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: dyn_extract_v15f32_s_s_offset3: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_mov_b32 s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: s_mov_b32 m0, s17 +; GFX10PLUS-NEXT: s_mov_b32 s0, s2 +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s4, s6 +; GFX10PLUS-NEXT: s_mov_b32 s5, s7 +; GFX10PLUS-NEXT: s_mov_b32 s6, s8 +; GFX10PLUS-NEXT: s_mov_b32 s7, s9 +; GFX10PLUS-NEXT: s_mov_b32 s8, s10 +; GFX10PLUS-NEXT: s_mov_b32 s9, s11 +; GFX10PLUS-NEXT: s_mov_b32 s10, s12 +; GFX10PLUS-NEXT: s_mov_b32 s11, s13 +; GFX10PLUS-NEXT: s_mov_b32 
s12, s14 +; GFX10PLUS-NEXT: s_mov_b32 s13, s15 +; GFX10PLUS-NEXT: s_mov_b32 s14, s16 +; GFX10PLUS-NEXT: s_movrels_b32 s0, s3 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog entry: %add = add i32 %sel, 3 %ext = extractelement <15 x float> %vec, i32 %add @@ -3484,40 +3917,40 @@ ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc ; MOVREL-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: dyn_extract_v15f32_v_v_offset3: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_nc_u32_e32 v15, 3, v15 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15 -; GFX10-NEXT: v_cndmask_b32_e32 
v0, v0, v14, vcc_lo -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: dyn_extract_v15f32_v_v_offset3: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc_lo +; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v15 +; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] entry: %add = add i32 %sel, 3 %ext = extractelement <15 x float> %vec, i32 %add @@ -3778,6 +4211,92 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: 
global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dyn_extract_v4f32_s_s_s: +; GFX11: .amd_kernel_code_t +; GFX11-NEXT: amd_code_version_major = 1 +; GFX11-NEXT: amd_code_version_minor = 2 +; GFX11-NEXT: amd_machine_kind = 1 +; GFX11-NEXT: amd_machine_version_major = 11 +; GFX11-NEXT: amd_machine_version_minor = 0 +; GFX11-NEXT: amd_machine_version_stepping = 0 +; GFX11-NEXT: kernel_code_entry_byte_offset = 256 +; GFX11-NEXT: kernel_code_prefetch_byte_size = 0 +; GFX11-NEXT: granulated_workitem_vgpr_count = 0 +; GFX11-NEXT: granulated_wavefront_sgpr_count = 0 +; GFX11-NEXT: priority = 0 +; GFX11-NEXT: float_mode = 240 +; GFX11-NEXT: priv = 0 +; GFX11-NEXT: enable_dx10_clamp = 1 +; GFX11-NEXT: debug_mode = 0 +; GFX11-NEXT: enable_ieee_mode = 1 +; GFX11-NEXT: enable_wgp_mode = 1 +; GFX11-NEXT: enable_mem_ordered = 1 +; GFX11-NEXT: enable_fwd_progress = 0 +; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GFX11-NEXT: user_sgpr_count = 15 +; GFX11-NEXT: enable_trap_handler = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1 +; GFX11-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_z = 0 +; GFX11-NEXT: enable_sgpr_workgroup_info = 0 +; GFX11-NEXT: enable_vgpr_workitem_id = 0 +; GFX11-NEXT: enable_exception_msb = 0 +; GFX11-NEXT: granulated_lds_size = 0 +; GFX11-NEXT: enable_exception = 0 +; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 +; GFX11-NEXT: enable_sgpr_dispatch_ptr = 0 +; GFX11-NEXT: enable_sgpr_queue_ptr = 0 +; GFX11-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GFX11-NEXT: enable_sgpr_dispatch_id = 0 +; GFX11-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX11-NEXT: enable_sgpr_private_segment_size = 0 +; GFX11-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GFX11-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GFX11-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GFX11-NEXT: enable_wavefront_size32 = 1 +; GFX11-NEXT: enable_ordered_append_gds = 0 +; GFX11-NEXT: private_element_size = 1 +; 
GFX11-NEXT: is_ptr64 = 1 +; GFX11-NEXT: is_dynamic_callstack = 0 +; GFX11-NEXT: is_debug_enabled = 0 +; GFX11-NEXT: is_xnack_enabled = 0 +; GFX11-NEXT: workitem_private_segment_byte_size = 0 +; GFX11-NEXT: workgroup_group_segment_byte_size = 0 +; GFX11-NEXT: gds_segment_byte_size = 0 +; GFX11-NEXT: kernarg_segment_byte_size = 12 +; GFX11-NEXT: workgroup_fbarrier_count = 0 +; GFX11-NEXT: wavefront_sgpr_count = 4 +; GFX11-NEXT: workitem_vgpr_count = 2 +; GFX11-NEXT: reserved_vgpr_first = 0 +; GFX11-NEXT: reserved_vgpr_count = 0 +; GFX11-NEXT: reserved_sgpr_first = 0 +; GFX11-NEXT: reserved_sgpr_count = 0 +; GFX11-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GFX11-NEXT: debug_private_segment_buffer_sgpr = 0 +; GFX11-NEXT: kernarg_segment_alignment = 4 +; GFX11-NEXT: group_segment_alignment = 4 +; GFX11-NEXT: private_segment_alignment = 4 +; GFX11-NEXT: wavefront_size = 5 +; GFX11-NEXT: call_convention = -1 +; GFX11-NEXT: runtime_loader_kernel_symbol = 0 +; GFX11-NEXT: .end_amd_kernel_code_t +; GFX11-NEXT: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s2, 1 +; GFX11-NEXT: s_cselect_b32 s3, 2.0, 1.0 +; GFX11-NEXT: s_cmp_eq_u32 s2, 2 +; GFX11-NEXT: s_cselect_b32 s3, 0x40400000, s3 +; GFX11-NEXT: s_cmp_eq_u32 s2, 3 +; GFX11-NEXT: s_cselect_b32 s2, 4.0, s3 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %ext = extractelement <4 x float> , i32 %sel store float %ext, float addrspace(1)* %out @@ -4047,6 +4566,94 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: dyn_extract_v4f64_s_s_s: +; GFX11: .amd_kernel_code_t +; GFX11-NEXT: amd_code_version_major = 1 +; GFX11-NEXT: 
amd_code_version_minor = 2 +; GFX11-NEXT: amd_machine_kind = 1 +; GFX11-NEXT: amd_machine_version_major = 11 +; GFX11-NEXT: amd_machine_version_minor = 0 +; GFX11-NEXT: amd_machine_version_stepping = 0 +; GFX11-NEXT: kernel_code_entry_byte_offset = 256 +; GFX11-NEXT: kernel_code_prefetch_byte_size = 0 +; GFX11-NEXT: granulated_workitem_vgpr_count = 0 +; GFX11-NEXT: granulated_wavefront_sgpr_count = 0 +; GFX11-NEXT: priority = 0 +; GFX11-NEXT: float_mode = 240 +; GFX11-NEXT: priv = 0 +; GFX11-NEXT: enable_dx10_clamp = 1 +; GFX11-NEXT: debug_mode = 0 +; GFX11-NEXT: enable_ieee_mode = 1 +; GFX11-NEXT: enable_wgp_mode = 1 +; GFX11-NEXT: enable_mem_ordered = 1 +; GFX11-NEXT: enable_fwd_progress = 0 +; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; GFX11-NEXT: user_sgpr_count = 15 +; GFX11-NEXT: enable_trap_handler = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1 +; GFX11-NEXT: enable_sgpr_workgroup_id_y = 0 +; GFX11-NEXT: enable_sgpr_workgroup_id_z = 0 +; GFX11-NEXT: enable_sgpr_workgroup_info = 0 +; GFX11-NEXT: enable_vgpr_workitem_id = 0 +; GFX11-NEXT: enable_exception_msb = 0 +; GFX11-NEXT: granulated_lds_size = 0 +; GFX11-NEXT: enable_exception = 0 +; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 +; GFX11-NEXT: enable_sgpr_dispatch_ptr = 0 +; GFX11-NEXT: enable_sgpr_queue_ptr = 0 +; GFX11-NEXT: enable_sgpr_kernarg_segment_ptr = 1 +; GFX11-NEXT: enable_sgpr_dispatch_id = 0 +; GFX11-NEXT: enable_sgpr_flat_scratch_init = 0 +; GFX11-NEXT: enable_sgpr_private_segment_size = 0 +; GFX11-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; GFX11-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; GFX11-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; GFX11-NEXT: enable_wavefront_size32 = 1 +; GFX11-NEXT: enable_ordered_append_gds = 0 +; GFX11-NEXT: private_element_size = 1 +; GFX11-NEXT: is_ptr64 = 1 +; GFX11-NEXT: is_dynamic_callstack = 0 +; GFX11-NEXT: is_debug_enabled = 0 +; GFX11-NEXT: is_xnack_enabled = 0 +; GFX11-NEXT: workitem_private_segment_byte_size = 
0 +; GFX11-NEXT: workgroup_group_segment_byte_size = 0 +; GFX11-NEXT: gds_segment_byte_size = 0 +; GFX11-NEXT: kernarg_segment_byte_size = 12 +; GFX11-NEXT: workgroup_fbarrier_count = 0 +; GFX11-NEXT: wavefront_sgpr_count = 7 +; GFX11-NEXT: workitem_vgpr_count = 3 +; GFX11-NEXT: reserved_vgpr_first = 0 +; GFX11-NEXT: reserved_vgpr_count = 0 +; GFX11-NEXT: reserved_sgpr_first = 0 +; GFX11-NEXT: reserved_sgpr_count = 0 +; GFX11-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; GFX11-NEXT: debug_private_segment_buffer_sgpr = 0 +; GFX11-NEXT: kernarg_segment_alignment = 4 +; GFX11-NEXT: group_segment_alignment = 4 +; GFX11-NEXT: private_segment_alignment = 4 +; GFX11-NEXT: wavefront_size = 5 +; GFX11-NEXT: call_convention = -1 +; GFX11-NEXT: runtime_loader_kernel_symbol = 0 +; GFX11-NEXT: .end_amd_kernel_code_t +; GFX11-NEXT: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_mov_b32 s3, 0x40080000 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s6, 1 +; GFX11-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 2 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %ext = extractelement <4 x double> , i32 %sel store double %ext, double addrspace(1)* %out @@ -4080,6 +4687,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_extract_v64i32_7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 +; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %elt = extractelement <64 x i32> %vec, i32 7 ret i32 %elt @@ -4112,6 +4728,14 @@ ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:128 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_extract_v64i32_32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %elt = extractelement <64 x i32> %vec, i32 32 ret i32 %elt @@ -4147,6 +4771,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_extract_v64i32_33: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %elt = extractelement <64 x i32> %vec, i32 33 ret i32 %elt @@ -4179,6 +4812,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_extract_v64i32_37: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:144 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %elt = extractelement <64 x i32> %vec, i32 37 ret i32 %elt diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -9,8 +9,11 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s + +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define half @v_fdiv_f16(half %a, half %b) { ; GFX6-IEEE-LABEL: v_fdiv_f16: @@ -76,6 +79,19 @@ ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; 
GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half %a, %b ret half %fdiv } @@ -105,6 +121,15 @@ ; GFX10-NEXT: v_rcp_f16_e32 v1, v1 ; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_f16_afn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn half %a, %b ret half %fdiv } @@ -173,6 +198,19 @@ ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_f16_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half %a, %b, !fpmath !0 ret half %fdiv } @@ -241,6 +279,19 @@ ; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half 1.0, %x ret half %fdiv } @@ -309,6 +360,19 @@ ; GFX10-NEXT: v_cvt_f16_f32_e32 
v1, v1 ; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_f16_arcp: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, 1.0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp half 1.0, %x ret half %fdiv } @@ -330,12 +394,12 @@ ; GFX89-NEXT: v_rcp_f16_e32 v0, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_rcp_f16_arcp_afn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rcp_f16_e32 v0, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_rcp_f16_arcp_afn: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_rcp_f16_e32 v0, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn half 1.0, %x ret half %fdiv } @@ -388,12 +452,12 @@ ; GFX89-NEXT: v_rcp_f16_e32 v0, v0 ; GFX89-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_rcp_f16_ulp25: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rcp_f16_e32 v0, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_rcp_f16_ulp25: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_rcp_f16_e32 v0, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv half 1.0, %x, !fpmath !0 ret half %fdiv } @@ -423,6 +487,15 @@ ; GFX10-NEXT: v_rcp_f16_e32 v1, v1 ; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
v_fdiv_f16_afn_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn half %a, %b, !fpmath !0 ret half %fdiv } @@ -491,6 +564,19 @@ ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_f16_arcp_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp half %a, %b, !fpmath !0 ret half %fdiv } @@ -632,6 +718,28 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_dual_mul_f32 v4, v7, v4 :: v_dual_mul_f32 v3, v6, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_div_fixup_f16 v0, v4, v1, v0 +; GFX11-NEXT: v_div_fixup_f16 v2, v3, v2, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, 
v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> %a, %b ret <2 x half> %fdiv } @@ -685,6 +793,21 @@ ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f16_afn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_rcp_f16_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f16_e32 v2, v3, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x half> %a, %b ret <2 x half> %fdiv } @@ -826,6 +949,28 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f16_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_dual_mul_f32 v4, v7, v4 :: v_dual_mul_f32 v3, v6, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_div_fixup_f16 v0, v4, v1, v0 +; GFX11-NEXT: v_div_fixup_f16 v2, v3, v2, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> %a, %b, !fpmath !0 ret <2 x 
half> %fdiv } @@ -960,6 +1105,27 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 +; GFX11-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> , %x ret <2 x half> %fdiv } @@ -1094,6 +1260,27 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_v2f16_arcp: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 +; GFX11-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x half> 
, %x ret <2 x half> %fdiv } @@ -1140,6 +1327,18 @@ ; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_v2f16_arcp_afn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_rcp_f16_e32 v0, v0 +; GFX11-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn <2 x half> , %x ret <2 x half> %fdiv } @@ -1242,6 +1441,18 @@ ; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_v2f16_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_rcp_f16_e32 v0, v0 +; GFX11-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x half> , %x, !fpmath !0 ret <2 x half> %fdiv } @@ -1295,6 +1506,21 @@ ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_rcp_f16_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: 
v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f16_e32 v2, v3, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv } @@ -1436,6 +1662,28 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v5 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_dual_mul_f32 v4, v7, v4 :: v_dual_mul_f32 v3, v6, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_div_fixup_f16 v0, v4, v1, v0 +; GFX11-NEXT: v_div_fixup_f16 v2, v3, v2, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv } @@ -1489,6 +1737,21 @@ ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_rcp_f16_e32 v1, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_rcp_f16_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-NEXT: 
v_mul_f16_e32 v2, v3, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0 ret <2 x half> %fdiv } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -11,6 +11,9 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-IEEE %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-FLUSH %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-IEEE %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FLUSH %s + define float @v_fdiv_f32(float %a, float %b) { ; GFX6-IEEE-LABEL: v_fdiv_f32: ; GFX6-IEEE: ; %bb.0: @@ -115,6 +118,52 @@ ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_f32: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_f32: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-FLUSH-NEXT: s_denorm_mode 0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv float %a, %b ret float %fdiv } @@ -134,6 +183,15 @@ ; GFX10-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_f32_afn: +; GFX11: ; %bb.0: +; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn float %a, %b ret float %fdiv } @@ -212,6 +270,43 @@ ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v2, v0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_f32_ulp25: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_f32_ulp25: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |v1| +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s0 +; GFX11-FLUSH-NEXT: 
v_mul_f32_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv float %a, %b, !fpmath !0 ret float %fdiv } @@ -320,6 +415,52 @@ ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_rcp_f32: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v3, v4, v2 +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v3, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v2 +; GFX11-IEEE-NEXT: v_fma_f32 v1, -v1, v3, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v3 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_rcp_f32: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v0, 
1.0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX11-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX11-FLUSH-NEXT: s_denorm_mode 0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv float 1.0, %x ret float %fdiv } @@ -428,6 +569,52 @@ ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_rcp_f32_arcp: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v3, v4, v2 +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v1, v3, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v2 +; GFX11-IEEE-NEXT: v_fma_f32 v1, -v1, v3, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v3 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_rcp_f32_arcp: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX11-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX11-FLUSH-NEXT: s_denorm_mode 0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp float 1.0, %x ret float %fdiv } @@ -445,6 +632,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_f32_arcp_afn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 
s[30:31] %fdiv = fdiv arcp afn float 1.0, %x ret float %fdiv } @@ -487,6 +681,28 @@ ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_rcp_f32_ulp25: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |v0| +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x2f800000, s0 +; GFX11-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v0, v0 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX11-IEEE-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_rcp_f32_ulp25: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv float 1.0, %x, !fpmath !0 ret float %fdiv } @@ -506,6 +722,15 @@ ; GFX10-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_f32_afn_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn float %a, %b, !fpmath !0 ret float %fdiv } @@ -584,6 +809,43 @@ ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v2, v0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX11-IEEE: ; %bb.0: +; 
GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |v1| +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s0 +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp float %a, %b, !fpmath !0 ret float %fdiv } @@ -766,6 +1028,85 @@ ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v5, v4 ; GFX10-FLUSH-NEXT: 
v_div_fixup_f32 v1, v2, v3, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_v2f32: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, null, v2, v2, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, null, v3, v3, v1 +; GFX11-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v4 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 +; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_dual_fmac_f32 v6, v8, v6 :: v_dual_fmac_f32 v7, v9, v7 +; GFX11-IEEE-NEXT: v_div_scale_f32 v8, s0, v1, v3, v1 +; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 +; GFX11-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 +; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 +; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX11-IEEE-NEXT: 
s_delay_alu instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_v2f32: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, null, v2, v2, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v6, vcc_lo, v0, v2, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v5 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5 +; GFX11-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v7, v8, v5 +; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 +; GFX11-FLUSH-NEXT: s_denorm_mode 0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v5, v6 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v3, v1 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v6, v5, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v4, v5 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v5 +; GFX11-FLUSH-NEXT: v_fma_f32 v7, -v6, v4, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v7, v5 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v6, v4, v2 +; GFX11-FLUSH-NEXT: s_denorm_mode 0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v5, v4 +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> %a, %b ret <2 x float> %fdiv } @@ -789,6 +1130,16 @@ ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f32_afn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x float> %a, %b ret <2 x float> %fdiv } @@ -914,6 +1265,62 @@ ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_v2f32_ulp25: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, null, v2, v2, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, null, v3, v3, v1 +; GFX11-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v4 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 +; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_dual_fmac_f32 v6, v8, v6 :: v_dual_fmac_f32 v7, 
v9, v7 +; GFX11-IEEE-NEXT: v_div_scale_f32 v8, s0, v1, v3, v1 +; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 +; GFX11-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 +; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 +; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_v2f32_ulp25: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |v2| +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s0 +; GFX11-FLUSH-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |v3| +; GFX11-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x2f800000, s0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_dual_mul_f32 v2, v2, v4 :: v_dual_mul_f32 v3, v3, v5 +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-FLUSH-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GFX11-FLUSH-NEXT: v_dual_mul_f32 v0, v4, v0 :: v_dual_mul_f32 v1, v5, v1 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> %a, %b, !fpmath !0 ret <2 x float> %fdiv } @@ -1096,6 +1503,85 @@ ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_rcp_v2f32: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v0, v0, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v3, null, v1, v1, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 +; GFX11-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_dual_fmac_f32 v4, v6, v4 :: v_dual_fmac_f32 v5, v7, v5 +; GFX11-IEEE-NEXT: v_div_scale_f32 v6, s0, 1.0, v1, 1.0 +; GFX11-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, 
v11, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6 +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7 +; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_rcp_v2f32: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-FLUSH-NEXT: s_denorm_mode 0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, null, v1, v1, 1.0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: 
v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2 +; GFX11-FLUSH-NEXT: s_denorm_mode 0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> , %x ret <2 x float> %fdiv } @@ -1278,6 +1764,85 @@ ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_rcp_v2f32_arcp: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v0, v0, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v3, null, v1, v1, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 +; GFX11-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_dual_fmac_f32 v4, v6, v4 :: v_dual_fmac_f32 v5, v7, v5 +; GFX11-IEEE-NEXT: 
v_div_scale_f32 v6, s0, 1.0, v1, 1.0 +; GFX11-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6 +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7 +; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_rcp_v2f32_arcp: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; 
GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-FLUSH-NEXT: s_denorm_mode 0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, null, v1, v1, 1.0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v4 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0 +; GFX11-FLUSH-NEXT: s_denorm_mode 3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2 +; GFX11-FLUSH-NEXT: s_denorm_mode 0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> , %x ret <2 x float> %fdiv } @@ -1297,6 +1862,14 @@ ; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_v2f32_arcp_afn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn <2 x float> , %x ret <2 x 
float> %fdiv } @@ -1353,6 +1926,33 @@ ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_rcp_v2f32_ulp25: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |v0| +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s0 +; GFX11-IEEE-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |v1| +; GFX11-IEEE-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x2f800000, s0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v0, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1 +; GFX11-IEEE-NEXT: v_dual_mul_f32 v0, v2, v0 :: v_dual_mul_f32 v1, v3, v1 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_rcp_v2f32_ulp25: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> , %x, !fpmath !0 ret <2 x float> %fdiv } @@ -1376,6 +1976,16 @@ ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f32_afn_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: 
s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x float> %a, %b, !fpmath !0 ret <2 x float> %fdiv } @@ -1501,6 +2111,62 @@ ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX11-IEEE: ; %bb.0: +; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, null, v2, v2, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, null, v3, v3, v1 +; GFX11-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v4 +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 +; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_dual_fmac_f32 v6, v8, v6 :: v_dual_fmac_f32 v7, v9, v7 +; GFX11-IEEE-NEXT: v_div_scale_f32 v8, s0, v1, v3, v1 +; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 +; GFX11-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 +; 
GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 +; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 +; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25: +; GFX11-FLUSH: ; %bb.0: +; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLUSH-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |v2| +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s0 +; GFX11-FLUSH-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |v3| +; GFX11-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x2f800000, s0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_dual_mul_f32 v2, v2, v4 :: v_dual_mul_f32 v3, v3, v5 +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FLUSH-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GFX11-FLUSH-NEXT: v_dual_mul_f32 v0, v4, v0 :: v_dual_mul_f32 v1, v5, v1 +; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 ret <2 x float> %fdiv } @@ -1524,6 +2190,16 @@ ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f32_arcp_afn_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f32_e32 v2, v2 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: 
s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn arcp <2 x float> %a, %b, !fpmath !0 ret <2 x float> %fdiv } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll @@ -11,6 +11,9 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s + define double @v_fdiv_f64(double %a, double %b) { ; GFX6-LABEL: v_fdiv_f64: ; GFX6: ; %bb.0: @@ -79,6 +82,28 @@ ; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; 
GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv double %a, %b ret double %fdiv } @@ -110,6 +135,24 @@ ; GFX10-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1] ; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_f64_afn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX11-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1] +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7] +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn double %a, %b ret double %fdiv } @@ -182,6 +225,28 @@ ; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_f64_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv double %a, %b, !fpmath !0 ret double %fdiv } @@ -255,6 +320,28 @@ ; GFX10-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_div_scale_f64 v[2:3], null, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: 
v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv double 1.0, %x ret double %fdiv } @@ -328,6 +415,28 @@ ; GFX10-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_f64_arcp: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_div_scale_f64 v[2:3], null, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp double 1.0, %x ret double %fdiv } @@ -359,6 +468,24 @@ ; GFX10-NEXT: v_fma_f64 v[0:1], -v[0:1], 
v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_f64_arcp_afn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; GFX11-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn double 1.0, %x ret double %fdiv } @@ -432,6 +559,28 @@ ; GFX10-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_f64_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_div_scale_f64 v[2:3], null, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; GFX11-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv double 1.0, %x, !fpmath !0 ret double %fdiv } @@ -463,6 +612,24 @@ ; GFX10-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1] ; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_f64_afn_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX11-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1] +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7] +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn double %a, %b, !fpmath !0 ret double %fdiv } @@ -535,6 +702,28 @@ ; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_f64_arcp_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_div_scale_f64 
v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp double %a, %b, !fpmath !0 ret double %fdiv } @@ -658,6 +847,44 @@ ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] +; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] +; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] +; GFX11-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] +; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX11-NEXT: s_mov_b32 vcc_lo, s0 +; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> %a, %b ret <2 x double> %fdiv } @@ -705,6 +932,35 @@ ; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] ; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f64_afn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f64_e32 
v[8:9], v[4:5] +; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] +; GFX11-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] +; GFX11-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9] +; GFX11-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1] +; GFX11-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] +; GFX11-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15] +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x double> %a, %b ret <2 x double> %fdiv } @@ -828,6 +1084,44 @@ ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f64_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3] +; 
GFX11-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] +; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] +; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] +; GFX11-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] +; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX11-NEXT: s_mov_b32 vcc_lo, s0 +; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] +; GFX11-NEXT: 
s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> %a, %b, !fpmath !0 ret <2 x double> %fdiv } @@ -952,6 +1246,44 @@ ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX11-NEXT: v_div_scale_f64 v[12:13], s0, 1.0, v[2:3], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] +; GFX11-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] +; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], 
v[18:19], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] +; GFX11-NEXT: s_mov_b32 vcc_lo, s0 +; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> , %x ret <2 x double> %fdiv } @@ -1076,6 +1408,44 @@ ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_v2f64_arcp: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX11-NEXT: 
v_div_scale_f64 v[12:13], s0, 1.0, v[2:3], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] +; GFX11-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] +; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] +; GFX11-NEXT: s_mov_b32 vcc_lo, s0 +; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x double> , %x ret <2 x double> %fdiv } @@ -1123,6 +1493,35 @@ ; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] ; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_v2f64_arcp_afn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[0:1] +; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[2:3] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; GFX11-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 +; GFX11-NEXT: v_fma_f64 
v[10:11], -v[2:3], v[6:7], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; GFX11-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f64 v[8:9], 1.0, v[4:5] +; GFX11-NEXT: v_mul_f64 v[10:11], 1.0, v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] +; GFX11-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp afn <2 x double> , %x ret <2 x double> %fdiv } @@ -1247,6 +1646,44 @@ ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_rcp_v2f64_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX11-NEXT: v_div_scale_f64 v[12:13], s0, 1.0, v[2:3], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] +; GFX11-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] +; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] +; GFX11-NEXT: s_mov_b32 vcc_lo, s0 +; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> , %x, !fpmath !0 ret <2 x double> %fdiv } @@ -1294,6 +1731,35 @@ ; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] ; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f64_afn_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] +; GFX11-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] +; GFX11-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9] +; GFX11-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1] +; GFX11-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] +; GFX11-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15] +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn <2 x double> %a, %b, !fpmath !0 ret <2 x double> %fdiv } @@ -1417,6 +1883,44 @@ ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f64_arcp_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] +; 
GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] +; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] +; GFX11-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] +; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] +; GFX11-NEXT: s_mov_b32 vcc_lo, s0 +; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x double> %a, %b, !fpmath !0 ret <2 x double> %fdiv } @@ -1464,6 +1968,35 @@ ; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] ; GFX10-NEXT: v_fma_f64 
v[2:3], v[2:3], v[10:11], v[14:15] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_v2f64_arcp_afn_ulp25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] +; GFX11-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[8:9] +; GFX11-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f64 v[12:13], v[0:1], v[8:9] +; GFX11-NEXT: v_mul_f64 v[14:15], v[2:3], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[12:13], v[0:1] +; GFX11-NEXT: v_fma_f64 v[2:3], -v[6:7], v[14:15], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[12:13] +; GFX11-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[14:15] +; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv afn arcp <2 x double> %a, %b, !fpmath !0 ret <2 x double> %fdiv } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { ; GFX9-LABEL: store_load_sindex_kernel: @@ -57,6 +58,23 @@ ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm +; +; GFX11-LABEL: store_load_sindex_kernel: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s1, s0, 2 +; GFX11-NEXT: s_and_b32 s0, s0, 15 +; GFX11-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -116,6 +134,18 @@ ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:128 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm +; +; GFX11-LABEL: store_load_vindex_kernel: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:128 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_endpgm bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -175,6 +205,20 @@ ; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: store_load_vindex_foo: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: scratch_store_b32 v0, v2, s32 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v0, v1, s32 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [32 x float], align 4, addrspace(5) %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* @@ -213,6 +257,15 @@ ; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: private_ptr_foo: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1 store float 1.000000e+01, float addrspace(5)* %gep, align 4 ret void @@ -278,6 +331,25 @@ ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:260 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-NEXT: s_endpgm +; +; GFX11-LABEL: store_load_sindex_small_offset_kernel: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s1, s0, 2 +; GFX11-NEXT: s_and_b32 s0, s0, 15 +; GFX11-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:260 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -347,6 +419,20 @@ ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:384 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm +; +; GFX11-LABEL: store_load_vindex_small_offset_kernel: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:260 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:384 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -419,6 +505,21 @@ ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: store_load_vindex_small_offset_foo: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: scratch_store_b32 v0, v2, s32 offset:256 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [64 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -497,6 +598,27 @@ ; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm +; +; GFX11-LABEL: store_load_sindex_large_offset_kernel: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 +; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s1, s0, 2 +; GFX11-NEXT: s_and_b32 s0, s0, 15 +; GFX11-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 +; GFX11-NEXT: scratch_load_b32 v0, v2, vcc_lo glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -568,6 +690,22 @@ ; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi offset:124 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm +; +; GFX11-LABEL: store_load_vindex_large_offset_kernel: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 
v2, 15 +; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 +; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: scratch_store_b32 v0, v2, vcc_lo dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 +; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo offset:124 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -642,6 +780,23 @@ ; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: store_load_vindex_large_offset_foo: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: scratch_store_b32 v0, v2, vcc_lo dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX11-NEXT: scratch_load_b32 v0, v1, vcc_lo glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %padding = alloca [4096 x i32], align 4, addrspace(5) %i = alloca [32 x float], align 4, addrspace(5) @@ -706,6 +861,18 @@ ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm +; +; GFX11-LABEL: store_load_large_imm_offset_kernel: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80 +; GFX11-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b32 v1, 
v2, off offset:4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_endpgm bb: %i = alloca [4096 x i32], align 4, addrspace(5) %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef @@ -764,6 +931,20 @@ ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:4 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: store_load_large_imm_offset_foo: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80 +; GFX11-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %i = alloca [4096 x i32], align 4, addrspace(5) %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef @@ -819,6 +1000,18 @@ ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_endpgm +; +; GFX11-LABEL: store_load_vidx_sidx_offset: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2 +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:1028 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:1028 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_endpgm bb: %alloca = alloca [32 x i32], align 4, addrspace(5) %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -863,6 +1056,18 @@ ; GFX940-NEXT: scratch_load_dwordx2 v[0:1], 
v0, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: store_load_i64_aligned: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, i64 addrspace(5)* %arg, align 8 %load = load volatile i64, i64 addrspace(5)* %arg, align 8 @@ -902,6 +1107,18 @@ ; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: store_load_i64_unaligned: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: store volatile i64 15, i64 addrspace(5)* %arg, align 1 %load = load volatile i64, i64 addrspace(5)* %arg, align 1 @@ -954,6 +1171,21 @@ ; GFX940-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: store_load_v3i32_unaligned: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s2, 3 +; GFX11-NEXT: s_mov_b32 s1, 2 +; GFX11-NEXT: s_mov_b32 s0, 1 +; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: scratch_store_b96 v0, v[1:3], off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: store volatile <3 x i32> , <3 x i32> addrspace(5)* %arg, align 1 %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1 @@ -1010,6 +1242,22 @@ ; GFX940-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: store_load_v4i32_unaligned: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s3, 4 +; GFX11-NEXT: s_mov_b32 s2, 3 +; GFX11-NEXT: s_mov_b32 s1, 2 +; GFX11-NEXT: s_mov_b32 s0, 1 +; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: store volatile <4 x i32> , <4 x i32> addrspace(5)* %arg, align 1 %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s define float @v_fma_f32(float %x, float %y, float %z) { ; GFX6-LABEL: v_fma_f32: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { ; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: @@ -91,6 +92,24 @@ ; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -219,6 +238,28 @@ ; GFX10-NEXT: v_max_f32_e32 v1, v1, v2 ; GFX10-NEXT: 
global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_sub_f32 v1, 0x80000000, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v4 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -330,6 +371,25 @@ ; GFX10-NEXT: v_med3_f32 v1, v1, |v2|, v3 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 +; GFX11-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-NEXT: v_med3_f32 v1, v1, |v2|, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -451,6 +511,26 @@ ; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_f32_e64 v1, 0x80000000, |v1| +; GFX11-NEXT: v_sub_f32_e64 v2, 0x80000000, |v2| +; GFX11-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -571,6 +651,25 @@ ; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_nnan_inputs_med3_f32_pat0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: 
global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -714,6 +813,28 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v0, v2, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 +; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v4 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX9-LABEL: v_fmul_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s define float @v_pow_f32(float %x, float %y) { ; GFX6-LABEL: v_pow_f32: @@ -37,6 +38,17 @@ ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } @@ -86,6 +98,19 @@ ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: v_exp_f32_e32 v1, v1 ; 
GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: v_log_f32_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v0, v2 :: v_dual_mul_dx9_zero_f32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: v_exp_f32_e32 v1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x float> @llvm.pow.v2f32(<2 x float> %x, <2 x float> %y) ret <2 x float> %pow } @@ -135,6 +160,21 @@ ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-NEXT: v_exp_f16_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call half @llvm.pow.f16(half %x, half %y) ret half %pow } @@ -214,6 +254,35 @@ ; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y) ret <2 x half> %pow } @@ -301,6 +370,36 @@ ; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_v2f16_fneg_lhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; 
GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v2, v2, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX11-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y) ret <2 x half> %pow @@ -389,6 +488,36 @@ ; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_v2f16_fneg_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v2, v2, v3 +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX11-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg) ret <2 x half> %pow @@ -485,6 +614,38 @@ ; GFX10-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_v2f16_fneg_lhs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_log_f16_e32 v0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_log_f16_e32 v2, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-NEXT: 
v_exp_f16_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y.fneg) @@ -530,6 +691,17 @@ ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_f32_fabs_lhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e64 v0, |v0| +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %pow = call float @llvm.pow.f32(float %fabs.x, float %y) ret float %pow @@ -568,6 +740,17 @@ ; GFX10-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_f32_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, |v1| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %pow = call float @llvm.pow.f32(float %x, float %fabs.y) ret float %pow @@ -606,6 +789,17 @@ ; GFX10-NEXT: v_mul_legacy_f32_e64 v0, v0, |v1| ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_f32_fabs_lhs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: 
v_log_f32_e64 v0, |v0| +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, |v1| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %fabs.y = call float @llvm.fabs.f32(float %y) %pow = call float @llvm.pow.f32(float %fabs.x, float %fabs.y) @@ -640,6 +834,15 @@ ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_pow_f32_sgpr_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_log_f32_e32 v1, s0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } @@ -672,6 +875,15 @@ ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s0, v0 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_pow_f32_vgpr_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } @@ -704,6 +916,15 @@ ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_pow_f32_sgpr_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_log_f32_e32 v0, s0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } @@ -741,6 +962,17 @@ ; 
GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_f32_fneg_lhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e64 v0, -v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg float %x %pow = call float @llvm.pow.f32(float %neg.x, float %y) ret float %pow @@ -779,6 +1011,17 @@ ; GFX10-NEXT: v_mul_legacy_f32_e64 v0, v0, -v1 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_f32_fneg_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, v0, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %neg.y = fneg float %y %pow = call float @llvm.pow.f32(float %x, float %neg.y) ret float %pow diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX6-LABEL: 
s_fshl_i7: @@ -126,6 +127,48 @@ ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i7: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 +; GFX11-NEXT: s_and_b32 s2, s2, 0x7f +; GFX11-NEXT: s_and_b32 s1, s1, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: v_mul_lo_u32 v1, -7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_sub_nc_u16 v1, 6, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX11-NEXT: v_lshlrev_b16 v0, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b16 v1, v1, s1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt) ret i7 %result } @@ -250,6 +293,47 @@ ; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 +; GFX11-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX11-NEXT: v_mul_lo_u32 v4, -7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX11-NEXT: v_mul_lo_u32 v3, v3, 7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: v_sub_nc_u16 v3, 6, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v3, 0x7f, v3 +; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt) ret i7 %result } @@ -300,6 +384,19 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_and_b32 s3, s2, 7 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, s3 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 %amt) ret i8 %result } @@ -356,6 +453,23 @@ ; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 %amt) ret i8 %result } @@ -394,6 +508,16 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, 4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i8_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s1, s1, 4 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 4) ret i8 %result } @@ -434,6 +558,17 @@ ; GFX10-NEXT: v_lshrrev_b16 v1, 4, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i8_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b16 v0, 4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b16 v1, 4, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 4) ret i8 %result } @@ -472,6 +607,16 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, 3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i8_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s0, 5 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s1, s1, 3 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 5) ret i8 %result } @@ -512,6 +657,17 
@@ ; GFX10-NEXT: v_lshrrev_b16 v1, 3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i8_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b16 v0, 5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b16 v1, 3, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 5) ret i8 %result } @@ -623,6 +779,35 @@ ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_v2i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_lshr_b32 s4, s1, 8 +; GFX11-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_and_b32 s6, s2, 7 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_lshl_b32 s0, s0, s6 +; GFX11-NEXT: s_and_b32 s6, s5, 7 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 +; GFX11-NEXT: s_lshr_b32 s4, s4, 1 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_lshl_b32 s3, s3, s6 +; GFX11-NEXT: s_lshr_b32 s4, s4, s5 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s4 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_bfe_u32 s2, 8, 0x100000 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> %amt = bitcast i16 %amt.arg to <2 x i8> @@ -737,6 +922,39 @@ ; GFX10-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_v2i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX11-NEXT: v_lshrrev_b16 v4, 1, v4 +; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, v3, v5 +; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b16 v4, v6, v4 +; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> %amt = bitcast i16 %amt.arg to <2 x i8> @@ -950,6 +1168,60 @@ ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_v4i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_lshr_b32 s6, s1, 8 +; 
GFX11-NEXT: s_lshr_b32 s7, s1, 16 +; GFX11-NEXT: s_lshr_b32 s8, s1, 24 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-NEXT: s_lshr_b32 s11, s2, 24 +; GFX11-NEXT: s_and_b32 s12, s2, 7 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_and_b32 s2, s6, 0xff +; GFX11-NEXT: s_and_b32 s6, s9, 7 +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 +; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s0, 24 +; GFX11-NEXT: s_lshl_b32 s0, s0, s12 +; GFX11-NEXT: s_lshl_b32 s3, s3, s6 +; GFX11-NEXT: s_lshr_b32 s2, s2, s9 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_and_b32 s2, s7, 0xff +; GFX11-NEXT: s_and_b32 s3, s10, 7 +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 +; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_lshl_b32 s3, s4, s3 +; GFX11-NEXT: s_lshr_b32 s2, s2, s6 +; GFX11-NEXT: s_and_b32 s4, s11, 7 +; GFX11-NEXT: s_and_not1_b32 s6, 7, s11 +; GFX11-NEXT: s_lshr_b32 s7, s8, 1 +; GFX11-NEXT: s_lshl_b32 s4, s5, s4 +; GFX11-NEXT: s_lshr_b32 s5, s7, s6 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s1, s2, 16 +; GFX11-NEXT: s_and_b32 s2, s3, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s1, s2, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> %amt = bitcast i32 %amt.arg to <4 x 
i8> @@ -1165,6 +1437,64 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_v4i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX11-NEXT: v_lshrrev_b16 v6, 1, v6 +; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v3, v9, v3 +; GFX11-NEXT: v_xor_b32_e32 v9, -1, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX11-NEXT: v_and_b32_e32 v12, 7, v2 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX11-NEXT: v_lshrrev_b16 v7, 1, v7 +; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX11-NEXT: v_lshrrev_b16 v8, 1, v8 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-NEXT: v_lshlrev_b16 v4, v10, v4 +; GFX11-NEXT: v_lshrrev_b16 v6, v9, v7 +; GFX11-NEXT: v_lshlrev_b16 v5, v11, v5 +; GFX11-NEXT: v_lshrrev_b16 v7, v13, v8 +; GFX11-NEXT: v_lshlrev_b16 v0, v12, v0 +; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v6 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> %amt = bitcast i32 %amt.arg to <4 x i8> @@ -1290,6 +1620,44 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i24: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffffff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x170001 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 +; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: 
v_subrev_nc_u32_e32 v1, 24, v0 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 23, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s1 +; GFX11-NEXT: v_lshl_or_b32 v0, s0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt) ret i24 %result } @@ -1412,6 +1780,44 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i24: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX11-NEXT: v_bfe_u32 v1, v1, 1, 23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v4, 0xffffffe8, v3 +; GFX11-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4 +; GFX11-NEXT: v_mul_hi_u32 v3, v2, v3 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v3, v3, 24 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, v3, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, v2, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt) ret i24 %result } @@ -1923,6 +2329,140 @@ ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_v2i24: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 +; GFX11-NEXT: s_lshr_b32 s6, s0, 8 +; GFX11-NEXT: s_bfe_u32 s9, 8, 0x100000 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: s_lshr_b32 s8, s0, 24 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, s9 +; GFX11-NEXT: s_lshr_b32 s11, s4, 24 +; GFX11-NEXT: s_or_b32 s0, s0, s6 +; GFX11-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: 
v_dual_mul_f32 v0, 0x4f7ffffe, v0 :: v_dual_mul_f32 v1, 0x4f7ffffe, v1 +; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX11-NEXT: s_lshr_b32 s7, s4, 16 +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX11-NEXT: s_or_b32 s0, s0, s6 +; GFX11-NEXT: s_lshr_b32 s6, s4, 8 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 +; GFX11-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, s9 +; GFX11-NEXT: s_lshr_b32 s12, s5, 8 +; GFX11-NEXT: s_or_b32 s4, s4, s6 +; GFX11-NEXT: s_bfe_u32 s6, s7, 0x100000 +; GFX11-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-NEXT: s_or_b32 s4, s4, s6 +; GFX11-NEXT: s_lshl_b32 s5, s5, s9 +; GFX11-NEXT: s_and_b32 s6, s12, 0xff +; GFX11-NEXT: s_or_b32 s5, s11, s5 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NEXT: v_mul_hi_u32 v2, v1, v3 +; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX11-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX11-NEXT: s_or_b32 s5, s5, s6 +; GFX11-NEXT: s_lshr_b32 s10, s1, 8 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX11-NEXT: s_and_b32 s7, s10, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, s9 +; GFX11-NEXT: s_bfe_u32 s6, s7, 0x100000 +; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX11-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX11-NEXT: s_lshr_b32 s7, s2, 8 +; GFX11-NEXT: s_or_b32 s1, s8, s1 +; GFX11-NEXT: s_lshr_b32 s8, s2, 16 +; GFX11-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-NEXT: s_lshl_b32 s7, s7, s9 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, s4, v0 +; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX11-NEXT: s_lshr_b32 s4, s2, 24 +; GFX11-NEXT: 
s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: s_or_b32 s2, s2, s7 +; GFX11-NEXT: s_lshl_b32 s3, s3, s9 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, s5, v1 +; GFX11-NEXT: s_and_b32 s5, s8, 0xff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_or_b32 s3, s4, s3 +; GFX11-NEXT: s_or_b32 s2, s2, s5 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: s_and_b32 s4, s10, 0xff +; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_lshl_b32 s6, s6, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v0, 0xffffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX11-NEXT: v_lshrrev_b32_e64 v2, v2, s2 +; GFX11-NEXT: s_bfe_u32 s2, s3, 0x100000 +; GFX11-NEXT: s_lshl_b32 s3, s4, 16 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: v_lshl_or_b32 v0, s0, v0, v2 +; GFX11-NEXT: s_lshr_b32 s0, s2, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_lshrrev_b32_e64 v2, v3, s0 +; GFX11-NEXT: s_or_b32 s0, s1, s6 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX11-NEXT: v_lshl_or_b32 v1, s0, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 8 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GFX11-NEXT: v_bfe_u32 v4, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 8, v4 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: ; return to shader part epilog %lhs = bitcast i48 %lhs.arg to <2 x i24> %rhs = bitcast i48 %rhs.arg to <2 x i24> %amt = bitcast i48 %amt.arg to <2 x i24> @@ -2143,6 +2683,72 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v7, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, v5, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_v2i24: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-NEXT: v_bfe_u32 v2, v2, 1, 23 +; GFX11-NEXT: v_bfe_u32 v3, v3, 1, 23 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_dual_mul_f32 v6, 0x4f7ffffe, v6 :: v_dual_mul_f32 v7, 0x4f7ffffe, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX11-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_lo_u32 v8, 0xffffffe8, v6 +; GFX11-NEXT: v_mul_lo_u32 v9, 0xffffffe8, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX11-NEXT: v_mul_hi_u32 v9, v7, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_hi_u32 v7, v5, v7 +; GFX11-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_hi_u32 v6, v4, v6 +; GFX11-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 +; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v6, 23, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: v_dual_and_b32 v4, 0xffffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; GFX11-NEXT: v_sub_nc_u32_e32 v7, 23, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffffff, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, v4, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, v1, v5, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) ret <2 x i24> %result } @@ -2189,6 +2795,17 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 +; GFX11-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-NEXT: s_not_b32 s1, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) ret i32 %result } @@ -2220,6 +2837,13 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, -5 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i32_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, -5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; 
return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 5) ret i32 %result } @@ -2251,6 +2875,13 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, -8 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i32_8: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, -8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 8) ret i32 %result } @@ -2292,6 +2923,17 @@ ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v1, v0, v1, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) ret i32 %result } @@ -2321,6 +2963,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, -5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i32_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, -5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 5) ret i32 %result } @@ -2350,6 +2999,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, -8 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i32_8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, -8 +; GFX11-NEXT: 
s_setpc_b64 s[30:31] %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 8) ret i32 %result } @@ -2389,6 +3045,15 @@ ; GFX10-NEXT: s_lshr_b32 s0, s0, 1 ; GFX10-NEXT: v_alignbit_b32 v0, s0, v1, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_i32_ssv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_alignbit_b32 v1, s0, s1, 1 +; GFX11-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX11-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_alignbit_b32 v0, s0, v1, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float ret float %cast.result @@ -2429,6 +3094,15 @@ ; GFX10-NEXT: s_not_b32 s1, s1 ; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_i32_svs: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX11-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-NEXT: s_not_b32 s1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float ret float %cast.result @@ -2472,6 +3146,15 @@ ; GFX10-NEXT: s_not_b32 s1, s2 ; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_i32_vss: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 +; GFX11-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-NEXT: s_not_b32 s1, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float ret float %cast.result @@ -2530,6 +3213,21 @@ ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX10-NEXT: 
v_alignbit_b32 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v2, v0, v2, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX11-NEXT: v_alignbit_b32 v3, v1, v3, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX11-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) ret <2 x i32> %result } @@ -2603,6 +3301,25 @@ ; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7 ; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v3, v0, v3, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX11-NEXT: v_alignbit_b32 v4, v1, v4, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX11-NEXT: v_alignbit_b32 v5, v2, v5, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX11-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v3, v6 +; GFX11-NEXT: v_alignbit_b32 v1, v1, v4, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_alignbit_b32 v2, v2, v5, v8 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.fshl.v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) ret <3 x i32> %result } @@ -2692,6 +3409,29 @@ ; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10 ; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_v4i32: +; GFX11: ; %bb.0: 
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v4, v0, v4, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX11-NEXT: v_alignbit_b32 v5, v1, v5, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX11-NEXT: v_xor_b32_e32 v9, -1, v9 +; GFX11-NEXT: v_alignbit_b32 v6, v2, v6, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX11-NEXT: v_xor_b32_e32 v10, -1, v10 +; GFX11-NEXT: v_alignbit_b32 v7, v3, v7, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX11-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v4, v8 +; GFX11-NEXT: v_alignbit_b32 v1, v1, v5, v9 +; GFX11-NEXT: v_alignbit_b32 v2, v2, v6, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_alignbit_b32 v3, v3, v7, v11 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) ret <4 x i32> %result } @@ -2750,6 +3490,21 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s3, s2, 15 +; GFX11-NEXT: s_and_not1_b32 s2, 15, s2 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_bfe_u32 s4, 1, 0x100000 +; GFX11-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX11-NEXT: s_lshr_b32 s1, s1, s4 +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX11-NEXT: s_lshl_b32 s0, s0, s3 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt) ret i16 %result } @@ -2791,6 +3546,17 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i16_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_bfe_u32 s2, 4, 0x100000 +; GFX11-NEXT: s_bfe_u32 
s1, s1, 0x100000 +; GFX11-NEXT: s_bfe_u32 s3, 12, 0x100000 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 4) ret i16 %result } @@ -2832,6 +3598,17 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i16_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_bfe_u32 s2, 5, 0x100000 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_bfe_u32 s3, 11, 0x100000 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 5) ret i16 %result } @@ -2887,6 +3664,21 @@ ; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt) ret i16 %result } @@ -2924,6 +3716,16 @@ ; GFX10-NEXT: v_lshrrev_b16 v1, 12, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i16_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b16 v0, 4, v0 +; GFX11-NEXT: v_lshrrev_b16 v1, 12, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 4) ret i16 %result } @@ -2961,6 +3763,16 @@ ; GFX10-NEXT: v_lshrrev_b16 v1, 11, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i16_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b16 v0, 5, v0 +; GFX11-NEXT: v_lshrrev_b16 v1, 11, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 5) ret i16 %result } @@ -3017,6 +3829,21 @@ ; GFX10-NEXT: v_lshrrev_b16 v1, v1, s1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_i16_ssv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_bfe_u32 s2, 1, 0x100000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX11-NEXT: v_lshlrev_b16 v0, v0, s0 +; GFX11-NEXT: v_lshrrev_b16 v1, v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt) %cast.result = bitcast i16 %result to half ret half %cast.result @@ -3067,6 +3894,19 @@ ; GFX10-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_i16_svs: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: v_lshrrev_b16 v0, 1, v0 +; GFX11-NEXT: s_and_not1_b32 s2, 15, s1 +; GFX11-NEXT: s_and_b32 s1, s1, 15 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: v_lshrrev_b16 v0, s2, v0 +; GFX11-NEXT: s_lshl_b32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt) %cast.result = bitcast i16 %result to half ret half %cast.result @@ -3123,6 +3963,21 @@ ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_i16_vss: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s2, s1, 15 +; GFX11-NEXT: s_and_not1_b32 s1, 15, s1 +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX11-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX11-NEXT: v_lshlrev_b16 v0, s2, v0 +; GFX11-NEXT: s_lshr_b32 s0, s0, s3 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt) %cast.result = bitcast i16 %result to half ret half %cast.result @@ -3227,6 +4082,30 @@ ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s6, s1, 0xffff +; GFX11-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-NEXT: s_and_b32 s3, s2, 0xf000f +; GFX11-NEXT: s_lshr_b32 s6, s6, 0x10001 +; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_not1_b32 s2, 0xf000f, s2 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-NEXT: s_pack_ll_b32_b16 
s1, s6, s1 +; GFX11-NEXT: s_lshl_b32 s0, s0, s3 +; GFX11-NEXT: s_lshl_b32 s3, s4, s5 +; GFX11-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshr_b32 s5, s2, 16 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_lshr_b32 s2, s4, s5 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) %cast = bitcast <2 x i16> %result to i32 ret i32 %cast @@ -3304,6 +4183,21 @@ ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xf000f, v2 +; GFX11-NEXT: v_pk_lshrrev_b16 v1, 1, v1 op_sel_hi:[0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v3, 0xf000f, v3 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_lshrrev_b16 v1, v3, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) ret <2 x i16> %result } @@ -3360,6 +4254,16 @@ ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 0x8000c, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_v2i16_4_8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, 0x80004, v0 +; GFX11-NEXT: v_pk_lshrrev_b16 v1, 0x8000c, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> <i16 4, i16 8>) ret <2 x i16> %result } @@ -3446,6 +4350,23 @@ ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, s1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_v2i16_ssv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX11-NEXT: s_lshr_b32 s2, s1, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: v_and_b32_e32 v0, 0xf000f, v0 +; GFX11-NEXT: s_lshr_b32 s1, s1, 0x10001 +; GFX11-NEXT: v_and_b32_e32 v1, 0xf000f, v1 +; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, v0, s0 +; GFX11-NEXT: v_pk_lshrrev_b16 v1, v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) %cast = bitcast <2 x i16> %result to float ret float %cast @@ -3527,6 +4448,22 @@ ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_v2i16_svs: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX11-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX11-NEXT: s_and_not1_b32 s1, 0xf000f, s1 +; GFX11-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-NEXT: s_lshr_b32 s4, s2, 16 +; GFX11-NEXT: v_pk_lshrrev_b16 v0, s1, v0 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshl_b32 s1, s3, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i16> 
@llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) %cast = bitcast <2 x i16> %result to float ret float %cast @@ -3620,6 +4557,26 @@ ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_v2i16_vss: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s3, s0, 0xffff +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: s_lshr_b32 s3, s3, 0x10001 +; GFX11-NEXT: s_lshr_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX11-NEXT: s_and_not1_b32 s1, 0xf000f, s1 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, s2, v0 +; GFX11-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshr_b32 s3, s1, 16 +; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: s_lshr_b32 s1, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) %cast = bitcast <2 x i16> %result to float ret float %cast @@ -3820,6 +4777,48 @@ ; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s9, s2, 0xffff +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX11-NEXT: s_lshr_b32 s9, s9, 0x10001 +; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_and_not1_b32 s4, 0xf000f, s4 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: s_lshr_b32 s8, s6, 16 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s9, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, s6 +; GFX11-NEXT: s_lshl_b32 s6, s7, s8 +; GFX11-NEXT: s_lshr_b32 s7, s2, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-NEXT: s_lshr_b32 s2, s2, s4 +; 
GFX11-NEXT: s_lshr_b32 s4, s7, s8 +; GFX11-NEXT: s_and_b32 s8, s3, 0xffff +; GFX11-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX11-NEXT: s_and_b32 s4, s5, 0xf000f +; GFX11-NEXT: s_lshr_b32 s8, s8, 0x10001 +; GFX11-NEXT: s_lshr_b32 s3, s3, 1 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-NEXT: s_and_not1_b32 s5, 0xf000f, s5 +; GFX11-NEXT: s_lshr_b32 s6, s1, 16 +; GFX11-NEXT: s_lshr_b32 s7, s4, 16 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s8, s3 +; GFX11-NEXT: s_lshl_b32 s1, s1, s4 +; GFX11-NEXT: s_lshl_b32 s4, s6, s7 +; GFX11-NEXT: s_lshr_b32 s6, s3, 16 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshr_b32 s7, s5, 16 +; GFX11-NEXT: s_lshr_b32 s3, s3, s5 +; GFX11-NEXT: s_lshr_b32 s5, s6, s7 +; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: ; return to shader part epilog %result = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) %cast.result = bitcast <4 x i16> %result to <2 x i32> ret <2 x i32> %cast.result @@ -3947,6 +4946,28 @@ ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX11-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_and_b32_e32 v5, 0xf000f, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xf000f, v6 +; GFX11-NEXT: v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_and_b32_e32 v7, 0xf000f, v7 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, v4, v0 +; GFX11-NEXT: v_pk_lshlrev_b16 v1, v5, v1 +; GFX11-NEXT: v_pk_lshrrev_b16 v2, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-NEXT: v_pk_lshrrev_b16 v3, v7, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) %cast.result = bitcast <4 x i16> %result to <4 x half> ret <4 x half> %cast.result @@ -3962,6 +4983,17 @@ ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 63 +; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[4:5] +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) ret i64 %result } @@ -3974,6 +5006,15 @@ ; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i64_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 +; GFX11-NEXT: s_lshr_b32 s2, s3, 27 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 5) ret i64 %result } @@ -3987,6 +5028,16 @@ ; GCN-NEXT: s_mov_b32 s3, s0 ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i64_32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b32 s2, s3 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: ; return to shader 
part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 32) ret i64 %result } @@ -3999,6 +5050,15 @@ ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 16 ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i64_48: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_lshl_b32 s1, s0, 16 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 48) ret i64 %result } @@ -4056,6 +5116,23 @@ ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v5, -1, v4 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v5, 63, v5 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) ret i64 %result } @@ -4093,6 +5170,16 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 27, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i64_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 5, v[0:1] +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 27, v3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 5) ret i64 %result } @@ -4126,6 +5213,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i64_32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 32) ret i64 %result } @@ -4167,6 +5261,17 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX10-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i64_48: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], 16, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 48) ret i64 %result } @@ -4219,6 +5324,21 @@ ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_i64_ssv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v2, 63, v1 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -4268,6 +5388,19 @@ ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_i64_svs: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[2:3] +; GFX11-NEXT: s_and_b64 s[2:3], s[2:3], 63 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -4317,6 +5450,20 @@ ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_i64_vss: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], 63 +; GFX11-NEXT: s_and_not1_b64 s[2:3], 63, s[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -4386,6 +5533,22 @@ ; GFX10-NEXT: s_or_b64 s[0:1], 
s[0:1], s[4:5] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b64 s[12:13], s[8:9], 63 +; GFX11-NEXT: s_and_not1_b64 s[8:9], 63, s[8:9] +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX11-NEXT: s_and_b64 s[8:9], s[10:11], 63 +; GFX11-NEXT: s_and_not1_b64 s[10:11], 63, s[10:11] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) ret <2 x i64> %result } @@ -4475,6 +5638,32 @@ ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v9, -1, v8 +; GFX11-NEXT: v_xor_b32_e32 v11, -1, v10 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX11-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] +; GFX11-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 63, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 63, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v10, v[2:3] +; GFX11-NEXT: v_lshrrev_b64 v[6:7], v11, v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v4 +; 
GFX11-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) ret <2 x i64> %result } @@ -4667,6 +5856,54 @@ ; GFX10-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b64 s[10:11], 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] +; GFX11-NEXT: s_and_not1_b64 s[8:9], s[10:11], s[8:9] +; GFX11-NEXT: s_sub_i32 s9, s12, 64 +; GFX11-NEXT: s_sub_i32 s10, 64, s12 +; GFX11-NEXT: s_cmp_lt_u32 s12, 64 +; GFX11-NEXT: s_cselect_b32 s18, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s12, 0 +; GFX11-NEXT: s_cselect_b32 s19, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[14:15], s[0:1], s10 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[2:3], s12 +; GFX11-NEXT: s_lshl_b64 s[12:13], s[0:1], s12 +; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 +; GFX11-NEXT: s_lshl_b32 s5, s6, 31 +; GFX11-NEXT: s_mov_b32 s4, s11 +; GFX11-NEXT: s_sub_i32 s14, s8, 64 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX11-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 +; GFX11-NEXT: s_sub_i32 s9, 64, s8 +; GFX11-NEXT: s_cmp_lt_u32 s8, 64 +; GFX11-NEXT: s_cselect_b32 s15, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_cselect_b32 s16, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[4:5], s9 +; GFX11-NEXT: s_lshr_b64 
s[8:9], s[4:5], s8 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX11-NEXT: s_cmp_lg_u32 s15, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s15, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 +; GFX11-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) ret i128 %result } @@ -4864,6 +6101,63 @@ ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v8 +; GFX11-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v18 +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3] +; GFX11-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 31, v6 +; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v9, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v5, v5, v12 +; GFX11-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] +; GFX11-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX11-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo +; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 +; 
GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo +; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[6:7] +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v14, v14, v16 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v15, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s0 +; GFX11-NEXT: v_or_b32_e32 v0, v12, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v1, v7, v5 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) ret i128 %result } @@ -5072,6 +6366,63 @@ ; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 ; GFX10-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_i128_ssv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0 +; GFX11-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX11-NEXT: s_lshl_b32 s9, s6, 31 +; GFX11-NEXT: 
v_lshlrev_b64 v[4:5], v12, s[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 +; GFX11-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo +; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] +; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v0 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v13 +; GFX11-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] +; GFX11-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v3, v1 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo +; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v7, v7, v9 +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v8, s2, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v10, 
s3, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s9, s1 +; GFX11-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> ret <4 x float> %cast.result @@ -5297,6 +6648,62 @@ ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_i128_svs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX11-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX11-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] +; GFX11-NEXT: s_and_not1_b64 s[4:5], s[6:7], s[4:5] +; GFX11-NEXT: s_sub_i32 s5, s8, 64 +; GFX11-NEXT: s_sub_i32 s6, 64, s8 +; GFX11-NEXT: s_cmp_lt_u32 s8, 64 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 31, v2 +; GFX11-NEXT: s_cselect_b32 s12, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: s_cselect_b32 s13, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s8 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX11-NEXT: s_cmp_lg_u32 s12, 0 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX11-NEXT: s_sub_i32 s0, 64, s4 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] +; GFX11-NEXT: s_sub_i32 s0, s4, 64 +; 
GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX11-NEXT: s_cselect_b32 s1, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: s_and_b32 s0, 1, s1 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: s_and_b32 s0, 1, s5 +; GFX11-NEXT: s_and_b32 s1, 1, s1 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] +; GFX11-NEXT: v_dual_cndmask_b32 v5, v9, v5 :: v_dual_cndmask_b32 v4, v8, v4 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX11-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> ret <4 x float> %cast.result @@ -5522,6 +6929,59 @@ ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshl_i128_vss: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] +; GFX11-NEXT: s_and_not1_b64 s[4:5], s[6:7], s[4:5] +; GFX11-NEXT: s_sub_i32 s5, s8, 64 +; GFX11-NEXT: s_sub_i32 s6, 64, s8 +; GFX11-NEXT: s_cmp_lt_u32 s8, 64 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s8, 
v[2:3] +; GFX11-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] +; GFX11-NEXT: s_cselect_b32 s10, 1, 0 +; GFX11-NEXT: s_and_b32 s6, 1, s9 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: s_lshl_b32 s9, s2, 31 +; GFX11-NEXT: s_mov_b32 s8, s7 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] +; GFX11-NEXT: s_and_b32 s5, 1, s10 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_sub_i32 s10, s4, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: s_cselect_b32 s11, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 +; GFX11-NEXT: s_cselect_b32 s12, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 +; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX11-NEXT: s_cmp_lg_u32 s12, 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: v_or_b32_e32 v0, s0, v6 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX11-NEXT: v_or_b32_e32 v1, s1, v7 +; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> ret <4 x float> %cast.result @@ -5571,6 +7031,18 @@ ; GFX10-NEXT: s_lshr_b32 s2, 
s7, 31 ; GFX10-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_i128_65: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_lshr_b32 s2, s5, 31 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 +; GFX11-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] +; GFX11-NEXT: s_lshr_b32 s2, s7, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65) ret i128 %result } @@ -5620,6 +7092,19 @@ ; GFX10-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_i128_65: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[6:7] +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 31, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 31, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65) ret i128 %result } @@ -5980,6 +7465,96 @@ ; GFX10-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshl_v2i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b64 s[18:19], 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] +; GFX11-NEXT: s_and_not1_b64 s[16:17], s[18:19], s[16:17] +; GFX11-NEXT: s_sub_i32 s17, s22, 64 +; GFX11-NEXT: s_sub_i32 s23, 64, s22 +; GFX11-NEXT: s_cmp_lt_u32 s22, 64 +; GFX11-NEXT: s_cselect_b32 s28, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s22, 0 +; GFX11-NEXT: 
s_cselect_b32 s29, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s23 +; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s22 +; GFX11-NEXT: s_lshl_b64 s[22:23], s[0:1], s22 +; GFX11-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s29, 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 +; GFX11-NEXT: s_lshl_b32 s9, s10, 31 +; GFX11-NEXT: s_mov_b32 s8, s19 +; GFX11-NEXT: s_sub_i32 s26, s16, 64 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 +; GFX11-NEXT: s_sub_i32 s17, 64, s16 +; GFX11-NEXT: s_cmp_lt_u32 s16, 64 +; GFX11-NEXT: s_cselect_b32 s27, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s16, 0 +; GFX11-NEXT: s_cselect_b32 s28, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 +; GFX11-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 +; GFX11-NEXT: s_cmp_lg_u32 s27, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] +; GFX11-NEXT: s_cmp_lg_u32 s27, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX11-NEXT: s_and_not1_b64 s[10:11], s[18:19], s[20:21] +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX11-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX11-NEXT: s_sub_i32 s11, s8, 64 +; GFX11-NEXT: s_sub_i32 s9, 64, s8 +; GFX11-NEXT: s_cmp_lt_u32 s8, 64 +; GFX11-NEXT: s_cselect_b32 s18, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_cselect_b32 s22, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s9 +; GFX11-NEXT: s_lshl_b64 s[20:21], s[6:7], s8 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; 
GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[20:21] +; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 +; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 +; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 +; GFX11-NEXT: s_lshl_b32 s13, s14, 31 +; GFX11-NEXT: s_mov_b32 s12, s19 +; GFX11-NEXT: s_sub_i32 s18, s10, 64 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[12:13] +; GFX11-NEXT: s_lshr_b64 s[12:13], s[14:15], 1 +; GFX11-NEXT: s_sub_i32 s11, 64, s10 +; GFX11-NEXT: s_cmp_lt_u32 s10, 64 +; GFX11-NEXT: s_cselect_b32 s19, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_cselect_b32 s20, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[12:13], s11 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[12:13], s10 +; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX11-NEXT: s_lshr_b64 s[12:13], s[12:13], s18 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13] +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13] +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 +; GFX11-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) ret <2 x i128> %result } @@ -6349,6 +7924,99 @@ ; GFX10-NEXT: v_or_b32_e32 v6, v6, v9 ; GFX10-NEXT: v_or_b32_e32 v7, v7, v10 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshl_v2i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] +; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 31, v10 +; GFX11-NEXT: v_xor_b32_e32 v16, -1, 
v16 +; GFX11-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v17, 64, v27 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v21 +; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v16 +; GFX11-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] +; GFX11-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 +; GFX11-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 +; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v28 +; GFX11-NEXT: v_or_b32_e32 v18, v16, v18 +; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 +; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] +; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v28 +; GFX11-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] +; GFX11-NEXT: v_or_b32_e32 v23, v23, v25 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v26 +; GFX11-NEXT: v_dual_cndmask_b32 v18, v0, v18 :: v_dual_cndmask_b32 v19, v1, v19 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v23, s0 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v10, v17, v24, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v11, 0, v22, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v22, v19, v3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v16, v8, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v10, v9, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v0, s0 +; GFX11-NEXT: v_and_b32_e32 v23, 0x7f, v20 +; GFX11-NEXT: v_or_b32_e32 v0, v21, v3 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v20 +; GFX11-NEXT: v_cndmask_b32_e64 v24, 0, v1, s0 +; GFX11-NEXT: v_or_b32_e32 v1, v11, v8 +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 
64, v23 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 31, v14 +; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v3 +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[12:13], v23, v[6:7] +; GFX11-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] +; GFX11-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v25 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v23 +; GFX11-NEXT: v_lshlrev_b64 v[16:17], v23, v[4:5] +; GFX11-NEXT: v_or_b32_e32 v12, v10, v12 +; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v25 +; GFX11-NEXT: v_lshrrev_b64 v[18:19], v25, v[8:9] +; GFX11-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v23 +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GFX11-NEXT: v_or_b32_e32 v5, v11, v13 +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15] +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v25 +; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v16, v18, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v19, v21 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v3, v12 :: v_dual_cndmask_b32 v5, v4, v5 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], v25, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v16, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v25 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v23 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v18, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v10, v8, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v11, v9, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v3, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v4, s0 +; GFX11-NEXT: v_or_b32_e32 v3, v22, v24 +; GFX11-NEXT: v_or_b32_e32 v4, v13, v5 +; GFX11-NEXT: v_or_b32_e32 v5, v14, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v9 +; 
GFX11-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) ret <2 x i128> %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX6-LABEL: s_fshr_i7: @@ -124,6 +125,46 @@ ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i7: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 +; GFX11-NEXT: s_and_b32 s2, s2, 0x7f +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s1, s1, 0x7f +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v1, -7, v0 +; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_mul_lo_u32 v0, v0, 7 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u16 v1, 6, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b16 v0, v0, s1 +; GFX11-NEXT: v_lshlrev_b16 v1, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt) ret i7 %result } @@ -249,6 +290,47 @@ ; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 +; GFX11-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_mul_lo_u32 v4, -7, v3 +; GFX11-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4 +; GFX11-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v3, v3, 7 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u16 v3, 6, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0x7f, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX11-NEXT: v_lshlrev_b16 v0, v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt) ret i7 %result } @@ -300,6 +382,19 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_and_b32 s3, s2, 7 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to 
shader part epilog %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 %amt) ret i8 %result } @@ -355,6 +450,22 @@ ; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v0, v3, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 %amt) ret i8 %result } @@ -393,6 +504,16 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, 4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i8_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s1, s1, 4 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 4) ret i8 %result } @@ -433,6 +554,17 @@ ; GFX10-NEXT: v_lshrrev_b16 v1, 4, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i8_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b16 v0, 4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b16 v1, 4, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 4) ret i8 %result } @@ -471,6 +603,16 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, 5 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i8_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s0, 3 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b32 s1, s1, 5 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 5) ret i8 %result } @@ -511,6 +653,17 @@ ; GFX10-NEXT: v_lshrrev_b16 v1, 5, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i8_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b16 v0, 3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b16 v1, 5, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 5) ret i8 %result } @@ -623,6 +776,35 @@ ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_v2i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_lshr_b32 s4, s1, 8 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 7 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, s5, 7 +; GFX11-NEXT: 
s_and_not1_b32 s5, 7, s5 +; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_lshl_b32 s3, s3, s5 +; GFX11-NEXT: s_lshr_b32 s2, s4, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s6 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_bfe_u32 s2, 8, 0x100000 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> %amt = bitcast i16 %amt.arg to <2 x i8> @@ -736,6 +918,40 @@ ; GFX10-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v2i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v2 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_lshrrev_b16 v3, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b16 v4, v6, v4 +; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b16 v0, 
v2, v0 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i16 %lhs.arg to <2 x i8> %rhs = bitcast i16 %rhs.arg to <2 x i8> %amt = bitcast i16 %amt.arg to <2 x i8> @@ -951,6 +1167,60 @@ ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_v4i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_lshr_b32 s6, s1, 8 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s0, 24 +; GFX11-NEXT: s_lshr_b32 s7, s1, 16 +; GFX11-NEXT: s_lshr_b32 s8, s1, 24 +; GFX11-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-NEXT: s_lshr_b32 s11, s2, 24 +; GFX11-NEXT: s_and_b32 s12, s2, 7 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_and_b32 s2, s9, 7 +; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 +; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX11-NEXT: s_lshr_b32 s1, s1, s12 +; GFX11-NEXT: s_lshl_b32 s3, s3, s9 +; GFX11-NEXT: s_lshr_b32 s2, s6, s2 +; GFX11-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_or_b32 s1, s3, s2 +; GFX11-NEXT: s_and_b32 s2, s10, 7 +; GFX11-NEXT: s_and_not1_b32 s3, 7, s10 +; GFX11-NEXT: s_lshl_b32 s4, s4, 1 +; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX11-NEXT: s_lshl_b32 s3, s4, s3 +; GFX11-NEXT: s_lshr_b32 
s2, s6, s2 +; GFX11-NEXT: s_and_not1_b32 s4, 7, s11 +; GFX11-NEXT: s_lshl_b32 s5, s5, 1 +; GFX11-NEXT: s_and_b32 s6, s11, 7 +; GFX11-NEXT: s_lshl_b32 s4, s5, s4 +; GFX11-NEXT: s_lshr_b32 s5, s8, s6 +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s1, 8 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s1, s2, 16 +; GFX11-NEXT: s_and_b32 s2, s3, 0xff +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s1, s2, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> %amt = bitcast i32 %amt.arg to <4 x i8> @@ -1166,6 +1436,64 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v4i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_xor_b32_e32 v12, -1, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v12, 7, v12 +; GFX11-NEXT: v_lshlrev_b16 v3, 1, v3 +; GFX11-NEXT: v_xor_b32_e32 v14, -1, v11 +; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX11-NEXT: v_xor_b32_e32 v10, -1, v2 +; GFX11-NEXT: v_lshlrev_b16 v3, v12, v3 +; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX11-NEXT: 
v_and_b32_e32 v12, 7, v14 +; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX11-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-NEXT: v_lshlrev_b16 v4, v12, v4 +; GFX11-NEXT: v_lshrrev_b16 v6, v11, v8 +; GFX11-NEXT: v_lshlrev_b16 v5, v7, v5 +; GFX11-NEXT: v_lshrrev_b16 v7, v13, v9 +; GFX11-NEXT: v_lshlrev_b16 v0, v10, v0 +; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v6 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lhs = bitcast i32 %lhs.arg to <4 x i8> %rhs = bitcast i32 %rhs.arg to <4 x i8> %amt = bitcast i32 %amt.arg to <4 x i8> @@ -1295,6 +1623,45 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v1, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i24: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffffff +; GFX11-NEXT: s_and_b32 s1, s1, 0xffffff +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 +; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 23, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s1 +; GFX11-NEXT: v_lshl_or_b32 v0, s0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i24 @llvm.fshr.i24(i24 %lhs, i24 %rhs, i24 %amt) ret i24 %result } @@ -1421,6 +1788,45 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-LABEL: v_fshr_i24: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v4, 0xffffffe8, v3 +; GFX11-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4 +; GFX11-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v3, v3, 24 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, v3, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i24 
@llvm.fshr.i24(i24 %lhs, i24 %rhs, i24 %amt) ret i24 %result } @@ -1932,6 +2338,142 @@ ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_v2i24: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 +; GFX11-NEXT: s_lshr_b32 s6, s0, 8 +; GFX11-NEXT: s_bfe_u32 s9, 8, 0x100000 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX11-NEXT: s_lshr_b32 s7, s0, 16 +; GFX11-NEXT: s_lshr_b32 s8, s0, 24 +; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, s9 +; GFX11-NEXT: s_lshr_b32 s10, s1, 8 +; GFX11-NEXT: s_or_b32 s0, s0, s6 +; GFX11-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-NEXT: s_and_b32 s7, s10, 0xff +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_dual_mul_f32 v0, 0x4f7ffffe, v0 :: v_dual_mul_f32 v1, 0x4f7ffffe, v1 +; GFX11-NEXT: s_lshr_b32 s10, s4, 8 +; GFX11-NEXT: s_lshr_b32 s11, s4, 16 +; GFX11-NEXT: s_and_b32 s10, s10, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX11-NEXT: s_and_b32 s12, s4, 0xff +; GFX11-NEXT: s_lshl_b32 s10, s10, s9 +; GFX11-NEXT: s_and_b32 s11, s11, 0xff +; GFX11-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 +; GFX11-NEXT: s_or_b32 s10, s12, s10 +; GFX11-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 +; GFX11-NEXT: s_bfe_u32 s11, s11, 0x100000 +; GFX11-NEXT: s_bfe_u32 s10, s10, 0x100000 +; GFX11-NEXT: s_lshl_b32 s11, s11, 16 +; GFX11-NEXT: s_lshr_b32 s12, s5, 8 +; GFX11-NEXT: s_or_b32 s10, s10, s11 +; GFX11-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX11-NEXT: s_and_b32 s5, s5, 0xff +; GFX11-NEXT: s_lshr_b32 s4, s4, 24 +; GFX11-NEXT: s_lshl_b32 s5, s5, s9 +; GFX11-NEXT: s_and_b32 s11, s12, 0xff +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: s_bfe_u32 s5, s11, 0x100000 +; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX11-NEXT: 
v_add_nc_u32_e32 v0, v0, v2 +; GFX11-NEXT: v_mul_hi_u32 v2, v1, v3 +; GFX11-NEXT: s_lshl_b32 s5, s5, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_or_b32 s4, s4, s5 +; GFX11-NEXT: v_mul_hi_u32 v0, s10, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, s9 +; GFX11-NEXT: s_lshr_b32 s11, s2, 16 +; GFX11-NEXT: s_or_b32 s1, s8, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX11-NEXT: s_lshr_b32 s8, s2, 8 +; GFX11-NEXT: s_lshr_b32 s5, s2, 24 +; GFX11-NEXT: s_and_b32 s8, s8, 0xff +; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX11-NEXT: v_mul_hi_u32 v1, s4, v1 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s8, s8, s9 +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX11-NEXT: s_or_b32 s2, s2, s8 +; GFX11-NEXT: s_and_b32 s8, s11, 0xff +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, s10, v0 +; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX11-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX11-NEXT: s_lshr_b32 s10, s3, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff +; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: s_lshl_b32 s8, s8, 16 +; GFX11-NEXT: s_lshl_b32 s3, s3, s9 +; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1 +; GFX11-NEXT: s_and_b32 s4, s10, 0xff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: s_or_b32 s2, s2, s8 +; GFX11-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX11-NEXT: s_or_b32 s3, s5, s3 +; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX11-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX11-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX11-NEXT: s_lshl_b32 s4, s4, 16 +; GFX11-NEXT: s_lshl_b32 s5, s6, 17 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_or_b32 s0, s5, s0 +; GFX11-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, 
v3, vcc_lo +; GFX11-NEXT: s_lshl_b32 s1, s1, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 23, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s4 +; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_lshl_or_b32 v0, s0, v2, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3 +; GFX11-NEXT: s_lshl_b32 s0, s7, 17 +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX11-NEXT: v_lshl_or_b32 v1, s0, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 8 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GFX11-NEXT: v_bfe_u32 v4, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 8, v4 +; 
GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: ; return to shader part epilog %lhs = bitcast i48 %lhs.arg to <2 x i24> %rhs = bitcast i48 %rhs.arg to <2 x i24> %amt = bitcast i48 %amt.arg to <2 x i24> @@ -2160,6 +2702,74 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v6, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, v4, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v2i24: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_dual_mul_f32 v6, 0x4f7ffffe, v6 :: v_dual_lshlrev_b32 v1, 1, v1 +; GFX11-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX11-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_lo_u32 v8, 0xffffffe8, v6 +; GFX11-NEXT: v_mul_lo_u32 v9, 0xffffffe8, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX11-NEXT: v_mul_hi_u32 v9, v7, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v6, v6, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v7, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_hi_u32 v7, v5, v7 +; GFX11-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_hi_u32 v6, v4, v6 +; GFX11-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 +; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v6, 23, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: v_dual_and_b32 v4, 0xffffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; GFX11-NEXT: v_sub_nc_u32_e32 v7, 23, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v1, v1, v4, v3 +; 
GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) ret <2 x i24> %result } @@ -2195,6 +2805,14 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt) ret i32 %result } @@ -2226,6 +2844,13 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 5 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i32_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 5) ret i32 %result } @@ -2257,6 +2882,13 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 8 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i32_8: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 8) ret i32 %result } @@ -2286,6 +2918,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i32 
@llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt) ret i32 %result } @@ -2315,6 +2954,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, 5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i32_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, 5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 5) ret i32 %result } @@ -2344,6 +2990,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, 8 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i32_8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, 8 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 8) ret i32 %result } @@ -2371,6 +3024,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_i32_ssv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float ret float %cast.result @@ -2399,6 +3057,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_i32_svs: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float ret float %cast.result @@ -2431,6 +3094,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_i32_vss: +; GFX11: ; %bb.0: +; GFX11-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt) %cast.result = bitcast i32 %result to float ret float %cast.result @@ -2465,6 +3135,14 @@ ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) ret <2 x i32> %result } @@ -2502,6 +3180,15 @@ ; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7 ; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v3, v6 +; GFX11-NEXT: v_alignbit_b32 v1, v1, v4, v7 +; GFX11-NEXT: v_alignbit_b32 v2, v2, v5, v8 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) ret <3 x i32> %result } @@ -2543,6 +3230,16 @@ ; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10 ; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v4, v8 +; GFX11-NEXT: v_alignbit_b32 v1, v1, v5, v9 +; GFX11-NEXT: v_alignbit_b32 v2, v2, v6, v10 +; GFX11-NEXT: v_alignbit_b32 v3, v3, v7, v11 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x 
i32> %amt) ret <4 x i32> %result } @@ -2602,6 +3299,21 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s3, s2, 15 +; GFX11-NEXT: s_bfe_u32 s4, 1, 0x100000 +; GFX11-NEXT: s_and_not1_b32 s2, 15, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, s4 +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt) ret i16 %result } @@ -2643,6 +3355,17 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i16_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_bfe_u32 s2, 12, 0x100000 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_bfe_u32 s3, 4, 0x100000 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 4) ret i16 %result } @@ -2684,6 +3407,17 @@ ; GFX10-NEXT: s_lshr_b32 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i16_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_bfe_u32 s2, 11, 0x100000 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_bfe_u32 s3, 5, 0x100000 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 5) ret i16 %result } @@ -2740,6 
+3474,21 @@ ; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v0, v3, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt) ret i16 %result } @@ -2777,6 +3526,16 @@ ; GFX10-NEXT: v_lshrrev_b16 v1, 4, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i16_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX11-NEXT: v_lshrrev_b16 v1, 4, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 4) ret i16 %result } @@ -2814,6 +3573,16 @@ ; GFX10-NEXT: v_lshrrev_b16 v1, 5, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i16_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b16 v0, 11, v0 +; GFX11-NEXT: v_lshrrev_b16 v1, 5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 5) ret i16 %result } @@ -2868,6 +3637,21 @@ ; GFX10-NEXT: 
v_lshlrev_b16 v1, v1, s0 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_i16_ssv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX11-NEXT: s_bfe_u32 s2, 1, 0x100000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b16 v0, v0, s1 +; GFX11-NEXT: v_lshlrev_b16 v1, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt) %cast.result = bitcast i16 %result to half ret half %cast.result @@ -2922,6 +3706,20 @@ ; GFX10-NEXT: s_lshl_b32 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_i16_svs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b32 s2, s1, 15 +; GFX11-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX11-NEXT: s_and_not1_b32 s1, 15, s1 +; GFX11-NEXT: v_lshrrev_b16 v0, s2, v0 +; GFX11-NEXT: s_lshl_b32 s0, s0, s3 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt) %cast.result = bitcast i16 %result to half ret half %cast.result @@ -2976,6 +3774,19 @@ ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_i16_vss: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: s_and_not1_b32 s2, 15, s1 +; GFX11-NEXT: s_and_b32 s1, s1, 15 +; 
GFX11-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX11-NEXT: v_lshlrev_b16 v0, s2, v0 +; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt) %cast.result = bitcast i16 %result to half ret half %cast.result @@ -3107,6 +3918,29 @@ ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_and_b32 s4, s2, 0xf000f +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX11-NEXT: s_and_not1_b32 s2, 0xf000f, s2 +; GFX11-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s2, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, s2 +; GFX11-NEXT: s_lshl_b32 s2, s3, s5 +; GFX11-NEXT: s_lshr_b32 s3, s1, 16 +; GFX11-NEXT: s_and_b32 s1, s1, 0xffff +; GFX11-NEXT: s_lshr_b32 s5, s4, 16 +; GFX11-NEXT: s_lshr_b32 s1, s1, s4 +; GFX11-NEXT: s_lshr_b32 s3, s3, s5 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) %cast = bitcast <2 x i16> %result to i32 ret i32 %cast @@ -3211,6 +4045,21 @@ ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX11-NEXT: v_and_b32_e32 
v2, 0xf000f, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v3, 0xf000f, v3 +; GFX11-NEXT: v_pk_lshrrev_b16 v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_lshlrev_b16 v0, v3, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) ret <2 x i16> %result } @@ -3267,6 +4116,16 @@ ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 0x80004, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v2i16_4_8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, 0x8000c, v0 +; GFX11-NEXT: v_pk_lshrrev_b16 v1, 0x80004, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> <i16 4, i16 8>) ret <2 x i16> %result } @@ -3380,6 +4239,21 @@ ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, s0 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_v2i16_ssv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX11-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xf000f, v0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX11-NEXT: s_lshl_b32 s2, s2, 1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xf000f, v1 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-NEXT: v_pk_lshrrev_b16 v0, v0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, s0 +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) %cast = bitcast <2 x i16>
%result to float ret float %cast @@ -3493,6 +4367,25 @@ ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_v2i16_svs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX11-NEXT: s_lshl_b32 s2, s2, 1 +; GFX11-NEXT: s_and_b32 s3, s1, 0xf000f +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX11-NEXT: s_and_not1_b32 s1, 0xf000f, s1 +; GFX11-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-NEXT: v_pk_lshrrev_b16 v0, s3, v0 +; GFX11-NEXT: s_lshl_b32 s0, s0, s1 +; GFX11-NEXT: s_lshl_b32 s1, s2, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) %cast = bitcast <2 x i16> %result to float ret float %cast @@ -3607,6 +4500,23 @@ ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_v2i16_vss: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX11-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX11-NEXT: s_and_not1_b32 s1, 0xf000f, s1 +; GFX11-NEXT: s_lshr_b32 s3, s0, 16 +; GFX11-NEXT: s_and_b32 s0, s0, 0xffff +; GFX11-NEXT: s_lshr_b32 s4, s2, 16 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, s1, v0 +; GFX11-NEXT: s_lshr_b32 s0, s0, s2 +; GFX11-NEXT: s_lshr_b32 s1, s3, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) %cast 
= bitcast <2 x i16> %result to float ret float %cast @@ -3858,6 +4768,46 @@ ; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX11-NEXT: s_lshl_b32 s6, s6, 1 +; GFX11-NEXT: s_and_b32 s7, s4, 0xf000f +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX11-NEXT: s_and_not1_b32 s4, 0xf000f, s4 +; GFX11-NEXT: s_lshr_b32 s6, s0, 16 +; GFX11-NEXT: s_lshr_b32 s8, s4, 16 +; GFX11-NEXT: s_lshl_b32 s0, s0, s4 +; GFX11-NEXT: s_lshl_b32 s4, s6, s8 +; GFX11-NEXT: s_lshr_b32 s6, s2, 16 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX11-NEXT: s_lshr_b32 s4, s1, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshr_b32 s8, s7, 16 +; GFX11-NEXT: s_lshl_b32 s1, s1, 0x10001 +; GFX11-NEXT: s_lshl_b32 s4, s4, 1 +; GFX11-NEXT: s_lshr_b32 s2, s2, s7 +; GFX11-NEXT: s_lshr_b32 s6, s6, s8 +; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-NEXT: s_and_not1_b32 s4, 0xf000f, s5 +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX11-NEXT: s_and_b32 s6, s5, 0xf000f +; GFX11-NEXT: s_lshr_b32 s5, s1, 16 +; GFX11-NEXT: s_lshr_b32 s7, s4, 16 +; GFX11-NEXT: s_lshl_b32 s1, s1, s4 +; GFX11-NEXT: s_lshl_b32 s4, s5, s7 +; GFX11-NEXT: s_lshr_b32 s5, s3, 16 +; GFX11-NEXT: s_and_b32 s3, s3, 0xffff +; GFX11-NEXT: s_lshr_b32 s7, s6, 16 +; GFX11-NEXT: s_lshr_b32 s3, s3, s6 +; GFX11-NEXT: s_lshr_b32 s5, s5, s7 +; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: ; return to shader part epilog %result = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) %cast.result = bitcast <4 x i16> %result to <2 x i32> ret <2 x i32> %cast.result @@ -4036,6 +4986,28 @@ ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 
s[30:31] +; +; GFX11-LABEL: v_fshr_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xf000f, v4 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX11-NEXT: v_and_b32_e32 v5, 0xf000f, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xf000f, v6 +; GFX11-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_and_b32_e32 v7, 0xf000f, v7 +; GFX11-NEXT: v_pk_lshrrev_b16 v2, v4, v2 +; GFX11-NEXT: v_pk_lshrrev_b16 v3, v5, v3 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_lshlrev_b16 v1, v7, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) %cast.result = bitcast <4 x i16> %result to <4 x half> ret <4 x half> %cast.result @@ -4081,6 +5053,17 @@ ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b64 s[6:7], 63, s[4:5] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: s_and_b64 s[4:5], s[4:5], 63 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt) ret i64 %result } @@ -4093,6 +5076,15 @@ ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 5 ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i64_5: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_lshl_b32 s1, s0, 27 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 5) ret i64 %result } @@ -4106,6 +5098,16 @@ ; GCN-NEXT: s_mov_b32 s3, s0 ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i64_32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b32 s2, s3 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 32) ret i64 %result } @@ -4118,6 +5120,15 @@ ; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i64_48: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX11-NEXT: s_lshr_b32 s2, s3, 16 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 48) ret i64 %result } @@ -4175,6 +5186,23 @@ ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v5, -1, v4 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v5, 63, v5 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt) ret i64 %result } @@ -4216,6 +5244,17 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 27, v4 ; GFX10-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i64_5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, v0 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], 5, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 27, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 5) ret i64 %result } @@ -4249,6 +5288,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i64_32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 32) ret i64 %result } @@ -4283,6 +5329,16 @@ ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i64_48: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 
v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 48) ret i64 %result } @@ -4335,6 +5391,21 @@ ; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX10-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_i64_ssv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v2, 63, v1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1] +; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt) %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -4384,6 +5455,20 @@ ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_i64_svs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], 63 +; GFX11-NEXT: s_and_not1_b64 s[2:3], 63, s[2:3] +; GFX11-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt) %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -4433,6 +5518,19 @@ ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, 
s1, v1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_i64_vss: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[2:3] +; GFX11-NEXT: s_and_b64 s[2:3], s[2:3], 63 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt) %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -4502,6 +5600,22 @@ ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b64 s[12:13], 63, s[8:9] +; GFX11-NEXT: s_and_b64 s[8:9], s[8:9], 63 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX11-NEXT: s_and_not1_b64 s[8:9], 63, s[10:11] +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_and_b64 s[10:11], s[10:11], 63 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) ret <2 x i64> %result } @@ -4591,6 +5705,32 @@ ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: 
v_xor_b32_e32 v9, -1, v8 +; GFX11-NEXT: v_xor_b32_e32 v11, -1, v10 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 63, v9 +; GFX11-NEXT: v_and_b32_e32 v11, 63, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 63, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] +; GFX11-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) ret <2 x i64> %result } @@ -4783,6 +5923,53 @@ ; GFX10-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_movk_i32 s10, 0x7f +; GFX11-NEXT: s_mov_b32 s11, 0 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] +; GFX11-NEXT: s_and_not1_b64 s[8:9], s[10:11], s[8:9] +; GFX11-NEXT: s_lshr_b32 s10, s1, 31 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] +; GFX11-NEXT: s_sub_i32 s13, s8, 64 +; GFX11-NEXT: s_sub_i32 s9, 64, s8 +; GFX11-NEXT: s_cmp_lt_u32 s8, 64 +; GFX11-NEXT: s_cselect_b32 s16, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_cselect_b32 s17, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 +; 
GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 +; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s13 +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX11-NEXT: s_sub_i32 s14, s12, 64 +; GFX11-NEXT: s_sub_i32 s10, 64, s12 +; GFX11-NEXT: s_cmp_lt_u32 s12, 64 +; GFX11-NEXT: s_cselect_b32 s15, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s12, 0 +; GFX11-NEXT: s_cselect_b32 s16, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], s12 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s10 +; GFX11-NEXT: s_lshr_b64 s[12:13], s[6:7], s12 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 +; GFX11-NEXT: s_cmp_lg_u32 s15, 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] +; GFX11-NEXT: s_cmp_lg_u32 s16, 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s15, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[12:13], 0 +; GFX11-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) ret i128 %result } @@ -4980,6 +6167,59 @@ ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v9, -1, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 31, v1 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v9 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v10 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b64 v[14:15], v18, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 +; GFX11-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc_lo +; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] +; GFX11-NEXT: v_subrev_nc_u32_e32 v21, 64, v19 +; GFX11-NEXT: v_lshrrev_b64 v[12:13], v19, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v8 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v9 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v21, v[6:7] +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v16 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v15, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v12, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v13, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s0 +; GFX11-NEXT: v_or_b32_e32 v0, v14, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v7, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-NEXT: v_or_b32_e32 v3, 
v3, v8 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) ret i128 %result } @@ -5188,6 +6428,58 @@ ; GFX10-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX10-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_i128_ssv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX11-NEXT: s_lshr_b32 s8, s1, 31 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: s_mov_b32 s9, 0 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v1 +; GFX11-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v12, s[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] +; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo +; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v13 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] +; GFX11-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], v13, s[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v12 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v8, s8, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v10, s9, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 +; GFX11-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> ret <4 x float> %cast.result @@ -5417,6 +6709,64 @@ ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_i128_svs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_movk_i32 s6, 0x7f +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] +; GFX11-NEXT: s_and_not1_b64 s[4:5], s[6:7], s[4:5] +; GFX11-NEXT: s_lshr_b32 s6, s1, 31 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX11-NEXT: s_sub_i32 s9, s4, 64 +; GFX11-NEXT: s_sub_i32 s5, 64, s4 +; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] +; GFX11-NEXT: s_cselect_b32 s12, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cselect_b32 s13, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s5 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 +; GFX11-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX11-NEXT: s_cmp_lg_u32 s12, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; 
GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX11-NEXT: s_sub_i32 s0, 64, s8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] +; GFX11-NEXT: s_sub_i32 s0, s8, 64 +; GFX11-NEXT: s_cmp_lt_u32 s8, 64 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX11-NEXT: s_cselect_b32 s1, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX11-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-NEXT: s_and_b32 s0, 1, s1 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: s_and_b32 s0, 1, s6 +; GFX11-NEXT: s_and_b32 s1, 1, s1 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], s8, v[2:3] +; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX11-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> ret <4 x float> %cast.result @@ -5638,6 +6988,57 @@ ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_fshr_i128_vss: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 31, v1 +; GFX11-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 
1, v[0:1] +; GFX11-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] +; GFX11-NEXT: s_and_not1_b64 s[4:5], s[6:7], s[4:5] +; GFX11-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX11-NEXT: s_sub_i32 s6, 64, s4 +; GFX11-NEXT: s_sub_i32 s5, s4, 64 +; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] +; GFX11-NEXT: s_cselect_b32 s7, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], s4, v[0:1] +; GFX11-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-NEXT: s_and_b32 s4, 1, s7 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX11-NEXT: s_and_b32 s4, 1, s9 +; GFX11-NEXT: s_sub_i32 s10, s8, 64 +; GFX11-NEXT: s_sub_i32 s6, 64, s8 +; GFX11-NEXT: s_cmp_lt_u32 s8, 64 +; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 +; GFX11-NEXT: s_cselect_b32 s11, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX11-NEXT: s_cselect_b32 s12, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 +; GFX11-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 +; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX11-NEXT: s_cmp_lg_u32 s12, 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: v_or_b32_e32 v0, s0, v6 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 +; GFX11-NEXT: v_or_b32_e32 v1, s1, v7 +; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 
%amt) %cast.result = bitcast i128 %result to <4 x float> ret <4 x float> %cast.result @@ -5691,6 +7092,18 @@ ; GFX10-NEXT: s_or_b64 s[0:1], s[4:5], s[6:7] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_i128_65: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_lshl_b32 s5, s0, 31 +; GFX11-NEXT: s_lshl_b32 s3, s2, 31 +; GFX11-NEXT: s_mov_b32 s2, s4 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[0:1], 1 +; GFX11-NEXT: s_or_b64 s[0:1], s[4:5], s[6:7] +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 65) ret i128 %result } @@ -5742,6 +7155,21 @@ ; GFX10-NEXT: v_or_b32_e32 v3, v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i128_65: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_lshlrev_b32 v9, 31, v0 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[6:7] +; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 31, v8 +; GFX11-NEXT: v_or_b32_e32 v1, v9, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 65) ret i128 %result } @@ -6102,6 +7530,95 @@ ; GFX10-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fshr_v2i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_movk_i32 s18, 0x7f +; GFX11-NEXT: s_mov_b32 s19, 0 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] +; 
GFX11-NEXT: s_and_not1_b64 s[16:17], s[18:19], s[16:17] +; GFX11-NEXT: s_lshr_b32 s24, s1, 31 +; GFX11-NEXT: s_mov_b32 s25, s19 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[24:25] +; GFX11-NEXT: s_sub_i32 s23, s16, 64 +; GFX11-NEXT: s_sub_i32 s17, 64, s16 +; GFX11-NEXT: s_cmp_lt_u32 s16, 64 +; GFX11-NEXT: s_cselect_b32 s28, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s16, 0 +; GFX11-NEXT: s_cselect_b32 s29, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s17 +; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 +; GFX11-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s23 +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s29, 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX11-NEXT: s_sub_i32 s26, s22, 64 +; GFX11-NEXT: s_sub_i32 s23, 64, s22 +; GFX11-NEXT: s_cmp_lt_u32 s22, 64 +; GFX11-NEXT: s_cselect_b32 s27, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s22, 0 +; GFX11-NEXT: s_cselect_b32 s28, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s22 +; GFX11-NEXT: s_lshl_b64 s[24:25], s[10:11], s23 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[10:11], s22 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] +; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 +; GFX11-NEXT: s_cmp_lg_u32 s27, 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] +; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s27, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 +; GFX11-NEXT: s_and_not1_b64 s[10:11], s[18:19], s[20:21] +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX11-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 +; GFX11-NEXT: s_lshr_b32 s18, s5, 31 +; GFX11-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] +; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 +; GFX11-NEXT: 
s_or_b64 s[6:7], s[6:7], s[18:19] +; GFX11-NEXT: s_sub_i32 s9, s10, 64 +; GFX11-NEXT: s_sub_i32 s11, 64, s10 +; GFX11-NEXT: s_cmp_lt_u32 s10, 64 +; GFX11-NEXT: s_cselect_b32 s20, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s11 +; GFX11-NEXT: s_lshl_b64 s[18:19], s[6:7], s10 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 +; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX11-NEXT: s_sub_i32 s18, s8, 64 +; GFX11-NEXT: s_sub_i32 s9, 64, s8 +; GFX11-NEXT: s_cmp_lt_u32 s8, 64 +; GFX11-NEXT: s_cselect_b32 s19, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_cselect_b32 s20, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], s8 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[14:15], s9 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], s8 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17] +; GFX11-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15] +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) ret <2 x i128> %result } @@ -6471,6 +7988,108 @@ ; GFX10-NEXT: v_or_b32_e32 v6, v6, v9 ; GFX10-NEXT: v_or_b32_e32 v7, v7, v10 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v2i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-NEXT: v_xor_b32_e32 v17, -1, v16 +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 31, v1 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v17 +; GFX11-NEXT: v_lshlrev_b64 v[23:24], v25, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v23, 0, v23 :: v_dual_and_b32 v26, 0x7f, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v24, 0, v24, vcc_lo +; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v25 +; GFX11-NEXT: v_lshlrev_b64 v[21:22], v25, v[2:3] +; GFX11-NEXT: v_subrev_nc_u32_e32 v19, 64, v25 +; GFX11-NEXT: v_subrev_nc_u32_e32 v27, 64, v26 +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v26 +; GFX11-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1] +; GFX11-NEXT: v_or_b32_e32 v22, v18, v22 +; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX11-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9] +; GFX11-NEXT: v_cndmask_b32_e32 v22, v1, v22, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[10:11] +; GFX11-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v16, v16, v18 +; GFX11-NEXT: v_or_b32_e32 v17, 
v17, v19 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v16, s0 +; GFX11-NEXT: v_xor_b32_e32 v16, -1, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v21, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v17, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v3, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26 +; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v16 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11] +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v9 :: v_dual_cndmask_b32 v0, v0, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 31, v5 +; GFX11-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] +; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v25 +; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, v3, s0 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v25 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX11-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v9, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[16:17], v25, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v25, v[6:7] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v25 +; GFX11-NEXT: v_or_b32_e32 v1, v24, v1 +; GFX11-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX11-NEXT: v_and_b32_e32 v23, 0x7f, v20 +; GFX11-NEXT: v_or_b32_e32 v2, v18, v2 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v11, 0, v16 :: v_dual_cndmask_b32 v10, v3, v10 +; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v23 +; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v23 +; GFX11-NEXT: v_lshrrev_b64 v[18:19], v23, v[12:13] +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo +; GFX11-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[14:15] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], v23, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo +; 
GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v23 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v10, v6, s2 +; GFX11-NEXT: v_or_b32_e32 v16, v18, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v19, v21 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v4, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v16, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v18, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v8, v12, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v8, v9, v13, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v3, s0 +; GFX11-NEXT: v_or_b32_e32 v3, v22, v26 +; GFX11-NEXT: v_or_b32_e32 v4, v11, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v5, v14, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) ret <2 x i128> %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s 
define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_s_v2i16_s_s: @@ -72,6 +73,24 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v2i16_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_and_b32 s1, s5, 1 +; GFX11-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s1, s1, 4 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_lshl_b32 s3, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s1, s2, s1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -145,6 +164,23 @@ ; GFX10-NEXT: v_and_or_b32 v2, v2, s1, s0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v2i16_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v2, v[0:1], off +; GFX11-NEXT: s_and_b32 s0, s3, 1 +; GFX11-NEXT: s_and_b32 s1, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s0 +; GFX11-NEXT: s_lshl_b32 s0, s1, s0 +; GFX11-NEXT: s_not_b32 s1, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_or_b32 v2, v2, s1, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1 )* %ptr %insert = insertelement <2 x i16> 
%vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -214,6 +250,23 @@ ; GFX10-NEXT: v_lshl_or_b32 v2, v2, s1, s0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v2i16_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_and_b32 s1, s4, 1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, 4 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_not1_b32 s0, s0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_lshl_or_b32 v2, v2, s1, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -288,6 +341,25 @@ ; GFX10-NEXT: v_and_or_b32 v2, s0, v3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v2i16_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_and_b32 s1, s4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff +; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_and_or_b32 v2, s0, v3, v2 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* 
%ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -359,6 +431,25 @@ ; GFX10-NEXT: v_and_or_b32 v2, s0, v2, v3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v2i16_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v2, s0, v2, v3 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -433,6 +524,25 @@ ; GFX10-NEXT: v_and_or_b32 v2, v3, v4, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v2i16_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX11-NEXT: s_and_b32 s0, s2, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff +; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_xor_b32_e32 v4, -1, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_or_b32 v2, v3, v4, v2 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -503,6 +613,25 @@ ; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v2i16_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v3, v[0:1], off +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-NEXT: s_and_b32 s0, s2, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, s0, v0 +; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_not_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_or_b32 v2, v3, s0, v2 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -574,6 +703,25 @@ ; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v2i16_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v2, v4, v2, v3 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx store <2 x i16> %insert, <2 x i16> addrspace(1)* null @@ -728,6 +876,30 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v4i16_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_lshr_b32 s0, s3, 1 +; GFX11-NEXT: s_and_b32 s1, s3, 1 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX11-NEXT: s_lshl_b32 s1, s1, 4 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s3, 0xffff, s1 +; GFX11-NEXT: s_lshl_b32 s1, s2, s1 +; GFX11-NEXT: s_not_b32 s2, s3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v4, v2, s2, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1 )* %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx store <4 x i16> %insert, <4 x i16> addrspace(1)* null @@ -833,6 +1005,32 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; 
GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v4i16_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s2, s4, 1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX11-NEXT: s_cmp_eq_u32 s2, 1 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cselect_b32 s3, s1, s0 +; GFX11-NEXT: s_and_b32 s4, s4, 1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_lshl_b32 s4, s4, 4 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s3, s3, s5 +; GFX11-NEXT: v_lshl_or_b32 v4, v2, s4, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx store <4 x i16> %insert, <4 x i16> addrspace(1)* null @@ -946,6 +1144,33 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v4i16_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 1, v0 +; GFX11-NEXT: s_and_b32 s2, s4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_lshlrev_b32 v1, 4, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v5, v5, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx store <4 x i16> %insert, <4 x i16> addrspace(1)* null @@ -1056,6 +1281,34 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v4i16_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v2, 4, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v5, v5, v3, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 
0, v4 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx store <4 x i16> %insert, <4 x i16> addrspace(1)* null @@ -1154,6 +1407,32 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v4i16_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 1, v2 +; GFX11-NEXT: s_and_b32 s0, s2, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX11-NEXT: v_and_b32_e32 v3, 1, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX11-NEXT: v_lshlrev_b32_e64 v2, v3, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v4, v4, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx store <4 x i16> %insert, <4 
x i16> addrspace(1)* null @@ -1248,6 +1527,31 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v4i16_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_lshr_b32 s1, s2, 1 +; GFX11-NEXT: s_and_b32 s0, s2, 1 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v0, v1 :: v_dual_lshlrev_b32 v2, s0, v2 +; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_not_b32 s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_and_or_b32 v4, v3, s0, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx store <4 x i16> %insert, <4 x i16> addrspace(1)* null @@ -1343,6 +1647,33 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v4i16_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: v_and_b32_e32 v4, 1, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v4 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; 
GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo +; GFX11-NEXT: v_and_or_b32 v4, v4, v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx store <4 x i16> %insert, <4 x i16> addrspace(1)* null @@ -1490,6 +1821,42 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v8i16_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s6, s5, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_eq_u32 s6, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cselect_b32 s7, s1, s0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 2 +; GFX11-NEXT: s_cselect_b32 s7, s2, s7 +; GFX11-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-NEXT: s_cselect_b32 s7, s3, s7 +; GFX11-NEXT: s_and_b32 s5, s5, 1 +; GFX11-NEXT: s_and_b32 s4, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, s5, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s8, 0xffff, s5 +; GFX11-NEXT: s_lshl_b32 s4, s4, s5 +; GFX11-NEXT: s_and_not1_b32 s5, s7, s8 +; GFX11-NEXT: s_or_b32 s4, s5, s4 +; GFX11-NEXT: 
s_cmp_eq_u32 s6, 0 +; GFX11-NEXT: s_cselect_b32 s0, s4, s0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 1 +; GFX11-NEXT: s_cselect_b32 s1, s4, s1 +; GFX11-NEXT: s_cmp_eq_u32 s6, 2 +; GFX11-NEXT: s_cselect_b32 s2, s4, s2 +; GFX11-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-NEXT: s_cselect_b32 s3, s4, s3 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -1613,6 +1980,38 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v8i16_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_lshr_b32 s4, s3, 1 +; GFX11-NEXT: s_and_b32 s1, s3, 1 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 +; GFX11-NEXT: s_lshl_b32 s3, s1, 4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff +; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s3 +; GFX11-NEXT: s_lshl_b32 s2, s2, s3 +; GFX11-NEXT: s_not_b32 s3, s5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v6, v4, s3, s2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1 )* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -1759,6 +2158,42 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v8i16_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s5, s4, 1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 1 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cselect_b32 s6, s1, s0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_cselect_b32 s6, s2, s6 +; GFX11-NEXT: s_cmp_eq_u32 s5, 3 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: s_cselect_b32 s6, s3, s6 +; GFX11-NEXT: s_and_b32 s4, s4, 1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_lshl_b32 s4, s4, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s7, 0xffff, s4 +; GFX11-NEXT: s_and_not1_b32 s6, s6, s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v6, v4, s4, s6 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 +; 
GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -1911,6 +2346,41 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v8i16_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX11-NEXT: s_and_b32 s1, s4, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_lshlrev_b32 v1, 4, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX11-NEXT: v_lshlrev_b32_e64 v4, v1, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_xor_b32_e32 v5, -1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2 +; GFX11-NEXT: 
v_cndmask_b32_e64 v2, v2, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -2061,6 +2531,43 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v8i16_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_lshlrev_b32 v2, 4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff +; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 +; GFX11-NEXT: v_xor_b32_e32 v5, -1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v3, s7 +; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo +; GFX11-NEXT: 
v_cndmask_b32_e64 v0, v0, v7, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -2185,6 +2692,38 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 ; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v8i16_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v2 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX11-NEXT: s_and_b32 s1, s2, 0xffff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e64 v7, v0, 0xffff +; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v9, v2, v7, v0 +; GFX11-NEXT: v_mov_b32_e32 v7, 0 +; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_cndmask_b32 v1, v4, v9 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 +; GFX11-NEXT: 
global_store_b128 v[7:8], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -2305,6 +2844,38 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 ; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v8i16_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off +; GFX11-NEXT: s_lshr_b32 s3, s2, 1 +; GFX11-NEXT: s_and_b32 s1, s2, 1 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, 2 +; GFX11-NEXT: s_lshl_b32 s2, s1, 4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 +; GFX11-NEXT: v_mov_b32_e32 v7, 0 +; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v1, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s2, v1 +; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_not_b32 s2, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v9, v0, s2, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, s3, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 +; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %insert = insertelement <8 x i16> %vec, 
i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -2426,6 +2997,38 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v8i16_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v4, v5 :: v_dual_lshlrev_b32 v0, 4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 +; GFX11-NEXT: v_lshlrev_b32_e64 v8, v0, 0xffff +; GFX11-NEXT: v_lshlrev_b32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v8 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_mov_b32_e32 v9, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v3, v3, v2, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1 +; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx store <8 x i16> %insert, <8 x i16> addrspace(1)* null @@ -2580,6 +3183,35 @@ ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX10-NEXT: global_store_dwordx4 
v[10:11], v[4:7], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v16i16_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0 +; GFX11-NEXT: s_and_b32 s0, s5, 1 +; GFX11-NEXT: s_lshr_b32 m0, s5, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_and_b32 s1, s4, 0xffff +; GFX11-NEXT: s_lshl_b32 s2, 0xffff, s0 +; GFX11-NEXT: s_lshl_b32 s0, s1, s0 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_movrels_b32 s3, s8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s1, s3, s2 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_movreld_b32 s8, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 +; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null @@ -2707,6 +3339,33 @@ ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GFX10-NEXT: global_store_dwordx4 v[10:11], v[6:9], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v16i16_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off +; GFX11-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16 +; GFX11-NEXT: s_and_b32 s0, s3, 1 +; GFX11-NEXT: s_lshr_b32 m0, s3, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_and_b32 s1, s2, 0xffff +; 
GFX11-NEXT: s_lshl_b32 s2, 0xffff, s0 +; GFX11-NEXT: s_lshl_b32 s0, s1, s0 +; GFX11-NEXT: s_not_b32 s1, s2 +; GFX11-NEXT: v_mov_b32_e32 v10, 16 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_movrels_b32_e32 v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v12, v0, s1, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_movreld_b32_e32 v2, v12 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX11-NEXT: global_store_b128 v[10:11], v[6:9], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1 )* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null @@ -2860,6 +3519,34 @@ ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v16i16_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0 +; GFX11-NEXT: s_and_b32 s0, s4, 1 +; GFX11-NEXT: s_lshr_b32 m0, s4, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; GFX11-NEXT: s_lshl_b32 s1, 0xffff, s0 +; GFX11-NEXT: v_mov_b32_e32 v10, 16 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_movrels_b32 s2, s8 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11 +; GFX11-NEXT: s_and_not1_b32 s1, s2, s1 +; GFX11-NEXT: v_mov_b32_e32 v1, s9 +; GFX11-NEXT: v_lshl_or_b32 v12, v8, s0, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: v_mov_b32_e32 v6, s14 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_mov_b32_e32 v9, 0 +; GFX11-NEXT: v_movreld_b32_e32 v0, v12 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: 
global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null @@ -3099,6 +3786,61 @@ ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v16i16_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 1, v0 +; GFX11-NEXT: s_and_b32 s5, s4, 0xffff +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX11-NEXT: v_lshlrev_b32_e64 v8, v0, s5 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s9 +; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 +; GFX11-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v10, v1, s15, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 +; GFX11-NEXT: v_and_or_b32 v13, v10, v9, v8 +; GFX11-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v10, 16 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null @@ -3336,6 +4078,63 @@ ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX10-NEXT: global_store_dwordx4 v[10:11], v[4:7], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v16i16_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[8:15], s[2:3], 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 
+; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 +; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s9 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, v1, v0 +; GFX11-NEXT: v_xor_b32_e32 v9, -1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v10, v2, s15, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v5, s13 +; GFX11-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v2, s10 +; GFX11-NEXT: v_mov_b32_e32 v7, s15 +; GFX11-NEXT: v_mov_b32_e32 v3, s11 +; GFX11-NEXT: v_and_or_b32 v13, v10, v9, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, s12 +; GFX11-NEXT: v_mov_b32_e32 v6, s14 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: v_mov_b32_e32 v9, 0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v13 :: v_dual_mov_b32 v10, 16 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v13, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v13, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v13, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v13, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v13, s5 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: 
global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null @@ -3530,6 +4329,58 @@ ; GFX10-NEXT: global_store_dwordx4 v[11:12], v[0:3], off ; GFX10-NEXT: global_store_dwordx4 v[13:14], v[4:7], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v16i16_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off +; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off offset:16 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX11-NEXT: s_and_b32 s5, s2, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v13, 16 :: v_dual_and_b32 v2, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 4, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e64 v11, v2, 0xffff +; GFX11-NEXT: v_lshlrev_b32_e64 v2, v2, s5 +; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 +; GFX11-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v6, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s4 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v15, v1, v11, v2 +; GFX11-NEXT: v_mov_b32_e32 v11, 0 +; GFX11-NEXT: v_mov_b32_e32 v12, 0 +; GFX11-NEXT: v_dual_mov_b32 v14, 0 :: v_dual_cndmask_b32 v1, v4, v15 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v15, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v15, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v15, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v7, v15, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v8, v15, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v15, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v10, v15, s5 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[11:12], v[0:3], off +; GFX11-NEXT: global_store_b128 v[13:14], v[4:7], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null @@ -3654,6 +4505,33 @@ ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GFX10-NEXT: global_store_dwordx4 v[11:12], v[7:10], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v16i16_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off +; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off offset:16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-NEXT: s_and_b32 s0, s2, 1 +; GFX11-NEXT: s_lshr_b32 m0, s2, 1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v11, 16 :: v_dual_lshlrev_b32 v0, s0, v0 +; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0 +; GFX11-NEXT: v_mov_b32_e32 v12, 0 +; GFX11-NEXT: s_not_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: v_movrels_b32_e32 v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v2, v1, s0, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_movreld_b32_e32 v3, v2 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off +; GFX11-NEXT: global_store_b128 v[11:12], v[7:10], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null @@ -3845,6 +4723,58 @@ ; GFX10-NEXT: global_store_dwordx4 v[12:13], v[0:3], off ; GFX10-NEXT: global_store_dwordx4 v[14:15], v[4:7], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v16i16_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:16 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 1, v3 +; GFX11-NEXT: v_dual_mov_b32 v14, 16 :: v_dual_and_b32 v3, 1, v3 +; GFX11-NEXT: v_dual_mov_b32 v15, 0 :: v_dual_and_b32 v2, 0xffff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 4, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s3, 5, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e64 v12, v3, 0xffff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v3, v2 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v12 +; GFX11-NEXT: v_mov_b32_e32 v12, 0 +; GFX11-NEXT: v_mov_b32_e32 v13, 0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cndmask_b32_e32 
v1, v4, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v8, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v11, s5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v16, v1, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v16, s6 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v16, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v16, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v16, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v16, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v10, v16, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v11, v16, s5 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[12:13], v[0:3], off +; GFX11-NEXT: global_store_b128 v[14:15], v[4:7], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx store <16 x i16> %insert, <16 x i16> addrspace(1)* null diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | 
FileCheck --check-prefix=GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_s_v2i8_s_s: @@ -83,6 +84,29 @@ ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v2i8_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s5, 1 +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s5, 0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s0 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -163,6 +187,28 @@ ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off 
; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v2i8_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, 1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, 0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(1 )* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -245,6 +291,28 @@ ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v2i8_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_load_u16 v1, v1, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_and_b32 v1, 0xff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: 
v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -330,6 +398,29 @@ ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v2i8_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: global_load_u16 v1, v1, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s4, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -412,6 +503,28 @@ ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: 
global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v2i8_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_load_u16 v2, v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_and_b32 v1, 0xff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -492,6 +605,28 @@ ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v2i8_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -569,6 +704,28 @@ ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v2i8_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -646,6 +803,28 @@ ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short 
v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v2i8_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx store <2 x i8> %insert, <2 x i8> addrspace(1)* null @@ -850,6 +1029,46 @@ ; GFX10-NEXT: v_or3_b32 v2, v4, v2, v3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v4i8_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_and_b32 s0, s3, 3 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s2, 0xff, s0 +; GFX11-NEXT: s_lshl_b32 s0, s1, s0 +; GFX11-NEXT: s_not_b32 s1, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v2, v3 +; GFX11-NEXT: v_and_or_b32 v0, v0, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v4, 0xff, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_or3_b32 v2, v4, v2, v3 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1 )* %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx store <4 x i8> %insert, <4 x i8> addrspace(1)* null @@ -996,6 +1215,44 @@ ; GFX10-NEXT: v_or3_b32 v2, v4, v3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v4i8_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: s_and_b32 s1, s4, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_lshl_b32 s1, s1, 3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX11-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-NEXT: s_and_b32 s3, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_or_b32 s3, s3, s4 +; GFX11-NEXT: s_lshl_b32 s2, s2, 24 +; GFX11-NEXT: 
s_or_b32 s0, s3, s0 +; GFX11-NEXT: s_lshl_b32 s3, 0xff, s1 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s0, s0, s3 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v4, v0, 0xff, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_or3_b32 v2, v4, v2, v3 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx store <4 x i8> %insert, <4 x i8> addrspace(1)* null @@ -1146,6 +1403,47 @@ ; GFX10-NEXT: v_or3_b32 v2, v4, v3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v4i8_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_and_b32 s1, s4, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff +; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX11-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-NEXT: s_and_b32 s2, s0, 0xff +; 
GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 24 +; GFX11-NEXT: s_or_b32 s0, s2, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_and_or_b32 v0, s0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v4, v0, 0xff, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_or3_b32 v2, v4, v2, v3 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx store <4 x i8> %insert, <4 x i8> addrspace(1)* null @@ -1293,6 +1591,47 @@ ; GFX10-NEXT: v_or3_b32 v2, v4, v3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v4i8_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_xor_b32_e32 v1, 
-1, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX11-NEXT: s_lshr_b32 s1, s0, 24 +; GFX11-NEXT: s_and_b32 s2, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_or_b32 s2, s2, s3 +; GFX11-NEXT: s_lshl_b32 s1, s1, 24 +; GFX11-NEXT: s_or_b32 s0, s2, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s0, s0, s1 +; GFX11-NEXT: v_and_or_b32 v0, s0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v4, v0, 0xff, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_or3_b32 v2, v4, v2, v3 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx store <4 x i8> %insert, <4 x i8> addrspace(1)* null @@ -1440,6 +1779,46 @@ ; GFX10-NEXT: v_or3_b32 v2, v2, v4, v3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v4i8_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: v_and_b32_e32 v1, 3, v2 +; GFX11-NEXT: s_and_b32 s0, s2, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v1, 0xff +; 
GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v2, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v2 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v3, v4 +; GFX11-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v4, 0xff, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_or3_b32 v2, v4, v2, v3 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx store <4 x i8> %insert, <4 x i8> addrspace(1)* null @@ -1584,6 +1963,46 @@ ; GFX10-NEXT: v_or3_b32 v2, v4, v2, v3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v4i8_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; 
GFX11-NEXT: s_and_b32 s0, s2, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_lshl_b32 s0, s0, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, s0, v2 +; GFX11-NEXT: s_lshl_b32 s0, 0xff, s0 +; GFX11-NEXT: s_not_b32 s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v3, v4 +; GFX11-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v4, 0xff, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_or3_b32 v2, v4, v2, v3 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx store <4 x i8> %insert, <4 x i8> addrspace(1)* null @@ -1728,6 +2147,47 @@ ; GFX10-NEXT: v_or3_b32 v2, v2, v4, v3 ; GFX10-NEXT: global_store_dword 
v[0:1], v2, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v4i8_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: v_and_b32_e32 v1, 3, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-NEXT: v_lshlrev_b32_e64 v6, v1, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, v1, v2 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v6 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v4, v5 +; GFX11-NEXT: v_and_or_b32 v0, v0, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v1, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_or_b32 v4, 0xff, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_or3_b32 v2, v4, v2, v3 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx store <4 x i8> %insert, <4 x i8> addrspace(1)* null @@ -1995,6 +2455,72 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v8i8_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s2, s5, 2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX11-NEXT: s_bfe_u32 s10, s1, 0x80008 +; GFX11-NEXT: s_lshr_b32 s3, s0, 24 +; GFX11-NEXT: s_lshr_b32 s6, s1, 24 +; GFX11-NEXT: s_and_b32 s7, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_and_b32 s9, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s8, s8, 8 +; GFX11-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s7, s7, s8 +; GFX11-NEXT: s_or_b32 s8, s9, s10 +; GFX11-NEXT: s_lshl_b32 s3, s3, 24 +; GFX11-NEXT: s_lshl_b32 s6, s6, 24 +; GFX11-NEXT: s_or_b32 s0, s7, s0 +; GFX11-NEXT: s_or_b32 s1, s8, s1 +; GFX11-NEXT: s_or_b32 s0, s0, s3 +; GFX11-NEXT: s_or_b32 s1, s1, s6 +; GFX11-NEXT: s_cmp_eq_u32 s2, 1 +; GFX11-NEXT: s_cselect_b32 s3, s1, s0 +; GFX11-NEXT: s_and_b32 s5, s5, 3 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s6, 0xff, s5 +; GFX11-NEXT: s_lshl_b32 s4, s4, s5 +; GFX11-NEXT: s_and_not1_b32 s3, s3, s6 +; GFX11-NEXT: s_or_b32 s3, s3, s4 +; GFX11-NEXT: s_cmp_eq_u32 s2, 0 +; GFX11-NEXT: s_cselect_b32 s0, s3, s0 +; GFX11-NEXT: s_cmp_eq_u32 s2, 1 +; GFX11-NEXT: s_cselect_b32 s1, s3, s1 +; GFX11-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX11-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-NEXT: s_and_b32 s3, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s0, s0, 
0x80010 +; GFX11-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX11-NEXT: s_lshr_b32 s5, s1, 24 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_and_b32 s6, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_or_b32 s3, s3, s4 +; GFX11-NEXT: s_lshl_b32 s4, s7, 8 +; GFX11-NEXT: s_or_b32 s0, s3, s0 +; GFX11-NEXT: s_or_b32 s3, s6, s4 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_lshl_b32 s2, s2, 24 +; GFX11-NEXT: s_or_b32 s1, s3, s1 +; GFX11-NEXT: s_lshl_b32 s3, s5, 24 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -2224,6 +2750,66 @@ ; GFX10-NEXT: v_or3_b32 v3, v3, v8, v5 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v8i8_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_lshr_b32 s0, s3, 2 +; GFX11-NEXT: s_and_b32 s1, s3, 3 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX11-NEXT: s_lshl_b32 s1, s1, 3 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s3, 0xff, s1 +; GFX11-NEXT: s_lshl_b32 s1, s2, s1 +; GFX11-NEXT: s_not_b32 s2, s3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v5, v1, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 8 +; GFX11-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or3_b32 v1, v1, v7, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or3_b32 v0, v0, v4, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v2, v2, s2, s1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v5, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v0, v3 +; GFX11-NEXT: v_and_or_b32 v5, 0xff, v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or3_b32 v2, v3, v4, v2 +; GFX11-NEXT: v_or3_b32 v3, v5, v7, v6 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1 )* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, 
<8 x i8> addrspace(1)* null @@ -2475,6 +3061,72 @@ ; GFX10-NEXT: v_or3_b32 v3, v5, v7, v4 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v8i8_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s2, s4, 2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s7, s0, 0x80008 +; GFX11-NEXT: s_bfe_u32 s9, s1, 0x80008 +; GFX11-NEXT: s_lshr_b32 s3, s0, 24 +; GFX11-NEXT: s_lshr_b32 s5, s1, 24 +; GFX11-NEXT: s_and_b32 s6, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_and_b32 s8, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_or_b32 s7, s8, s9 +; GFX11-NEXT: s_lshl_b32 s3, s3, 24 +; GFX11-NEXT: s_lshl_b32 s5, s5, 24 +; GFX11-NEXT: s_or_b32 s0, s6, s0 +; GFX11-NEXT: s_or_b32 s1, s7, s1 +; GFX11-NEXT: s_or_b32 s0, s0, s3 +; GFX11-NEXT: s_or_b32 s1, s1, s5 +; GFX11-NEXT: s_cmp_eq_u32 s2, 1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX11-NEXT: s_cselect_b32 s3, s1, s0 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: s_lshl_b32 s5, 0xff, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s3, s3, s5 +; GFX11-NEXT: v_lshl_or_b32 v2, v2, s4, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; GFX11-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: v_bfe_u32 v5, v1, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v0, v3 +; GFX11-NEXT: v_and_or_b32 v5, 0xff, v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or3_b32 v2, v3, v4, v2 +; GFX11-NEXT: v_or3_b32 v3, v5, v7, v6 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -2734,6 +3386,74 @@ ; GFX10-NEXT: v_or3_b32 v3, v5, v7, v4 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v8i8_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: s_and_b32 s2, s4, 0xff +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX11-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-NEXT: s_and_b32 s6, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_lshl_b32 s3, s3, 24 +; GFX11-NEXT: s_or_b32 s1, s6, s1 +; GFX11-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; 
GFX11-NEXT: v_and_b32_e32 v1, 3, v0 +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, s2 +; GFX11-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: s_lshl_b32 s2, s2, 24 +; GFX11-NEXT: s_or_b32 s0, s3, s0 +; GFX11-NEXT: v_xor_b32_e32 v4, -1, v0 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v5, s0, v1 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 +; GFX11-NEXT: v_and_or_b32 v3, v5, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v5, v1, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v0, v3 +; GFX11-NEXT: v_and_or_b32 v5, 0xff, v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX11-NEXT: v_or3_b32 v2, v3, v4, v2 +; GFX11-NEXT: v_or3_b32 v3, v5, v7, v6 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -2990,6 +3710,72 @@ ; GFX10-NEXT: v_or3_b32 v3, v5, v7, v4 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v8i8_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX11-NEXT: s_lshr_b32 s3, s1, 24 +; GFX11-NEXT: s_and_b32 s6, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s6, s6, s7 +; GFX11-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX11-NEXT: s_lshl_b32 s3, s3, 24 +; GFX11-NEXT: s_or_b32 s1, s6, s1 +; GFX11-NEXT: s_lshr_b32 s2, s0, 24 +; GFX11-NEXT: s_and_b32 s4, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_and_b32 v2, 3, v1 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: s_lshl_b32 s2, s2, 24 +; GFX11-NEXT: s_or_b32 s0, s3, s0 +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v5, s0, v1 :: v_dual_lshlrev_b32 v2, 3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0 +; GFX11-NEXT: v_lshlrev_b32_e64 v0, 
v2, 0xff +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v2, v5, v2, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v5, v1, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v0, v3 +; GFX11-NEXT: v_and_or_b32 v5, 0xff, v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or3_b32 v2, v3, v4, v2 +; GFX11-NEXT: v_or3_b32 v3, v5, v7, v6 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -3219,6 +4005,67 @@ ; GFX10-NEXT: v_or3_b32 v3, v8, v3, v6 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v8i8_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: v_and_b32_e32 v3, 3, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; GFX11-NEXT: s_and_b32 s0, 
s2, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v7, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b32_e64 v5, v3, 0xff +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v3, s0 +; GFX11-NEXT: v_or3_b32 v1, v1, v9, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or3_b32 v0, v0, v6, v4 +; GFX11-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v3, v5, v4, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v3, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v5, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v0, v3 +; GFX11-NEXT: v_and_or_b32 v5, 0xff, v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or3_b32 v2, v3, v4, v2 +; GFX11-NEXT: v_or3_b32 v3, v5, v7, v6 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -3445,6 +4292,70 @@ ; GFX10-NEXT: v_or3_b32 v3, v3, v8, v5 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v8i8_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_lshr_b32 s1, s2, 2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1 +; GFX11-NEXT: s_and_b32 s0, s2, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, s0, v2 +; GFX11-NEXT: s_lshl_b32 s0, 0xff, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_not_b32 s0, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v6, v1, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 8 +; GFX11-NEXT: v_bfe_u32 v4, v0, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v6 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or3_b32 v1, v1, v8, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v4 +; GFX11-NEXT: v_or3_b32 v0, v0, v5, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo +; GFX11-NEXT: v_and_or_b32 v2, v3, s0, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v5, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v0, v3 +; GFX11-NEXT: v_and_or_b32 v5, 0xff, v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or3_b32 v2, v3, v4, v2 +; GFX11-NEXT: v_or3_b32 v3, v5, v7, v6 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -3671,6 +4582,66 @@ ; 
GFX10-NEXT: v_or3_b32 v3, v3, v8, v5 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v8i8_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v8, v1, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX11-NEXT: v_bfe_u32 v10, v1, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v6 +; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or3_b32 v0, v0, v7, v5 +; GFX11-NEXT: v_or3_b32 v1, v1, v10, v9 +; GFX11-NEXT: v_and_b32_e32 v4, 3, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v5, v0, v1 :: v_dual_lshlrev_b32 v4, 3, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b32_e64 v6, v4, 0xff +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_xor_b32_e32 v4, -1, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v2, v5, v4, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v5, v1, 8, 8 +; GFX11-NEXT: 
v_bfe_u32 v3, v0, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v0, v3 +; GFX11-NEXT: v_and_or_b32 v5, 0xff, v1, v5 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or3_b32 v2, v3, v4, v2 +; GFX11-NEXT: v_or3_b32 v3, v5, v7, v6 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx store <8 x i8> %insert, <8 x i8> addrspace(1)* null @@ -4138,6 +5109,121 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v16i8_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s11, s0, 0x80008 +; GFX11-NEXT: s_bfe_u32 s13, s1, 0x80008 +; GFX11-NEXT: s_lshr_b32 s7, s1, 24 +; GFX11-NEXT: s_and_b32 s10, s0, 0xff +; GFX11-NEXT: s_and_b32 s12, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-NEXT: s_lshl_b32 s13, s13, 8 +; GFX11-NEXT: s_lshr_b32 s6, s0, 24 +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s10, s10, s11 +; GFX11-NEXT: s_or_b32 s11, s12, s13 +; GFX11-NEXT: s_bfe_u32 s15, s2, 0x80008 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_lshl_b32 s7, s7, 24 +; GFX11-NEXT: s_or_b32 s1, 
s11, s1 +; GFX11-NEXT: s_lshr_b32 s8, s2, 24 +; GFX11-NEXT: s_and_b32 s14, s2, 0xff +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX11-NEXT: s_lshl_b32 s6, s6, 24 +; GFX11-NEXT: s_lshl_b32 s15, s15, 8 +; GFX11-NEXT: s_or_b32 s0, s10, s0 +; GFX11-NEXT: s_or_b32 s1, s1, s7 +; GFX11-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX11-NEXT: s_lshr_b32 s9, s3, 24 +; GFX11-NEXT: s_or_b32 s12, s14, s15 +; GFX11-NEXT: s_or_b32 s0, s0, s6 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s6, s8, 24 +; GFX11-NEXT: s_and_b32 s8, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX11-NEXT: s_or_b32 s2, s12, s2 +; GFX11-NEXT: s_or_b32 s7, s8, s7 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s2, s2, s6 +; GFX11-NEXT: s_or_b32 s3, s7, s3 +; GFX11-NEXT: s_lshl_b32 s6, s9, 24 +; GFX11-NEXT: s_lshr_b32 s7, s5, 2 +; GFX11-NEXT: s_or_b32 s3, s3, s6 +; GFX11-NEXT: s_cmp_eq_u32 s7, 1 +; GFX11-NEXT: s_cselect_b32 s6, s1, s0 +; GFX11-NEXT: s_cmp_eq_u32 s7, 2 +; GFX11-NEXT: s_cselect_b32 s6, s2, s6 +; GFX11-NEXT: s_cmp_eq_u32 s7, 3 +; GFX11-NEXT: s_cselect_b32 s6, s3, s6 +; GFX11-NEXT: s_and_b32 s5, s5, 3 +; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s5, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s8, 0xff, s5 +; GFX11-NEXT: s_lshl_b32 s4, s4, s5 +; GFX11-NEXT: s_and_not1_b32 s5, s6, s8 +; GFX11-NEXT: s_or_b32 s4, s5, s4 +; GFX11-NEXT: s_cmp_eq_u32 s7, 0 +; GFX11-NEXT: s_cselect_b32 s0, s4, s0 +; GFX11-NEXT: s_cmp_eq_u32 s7, 1 +; GFX11-NEXT: s_cselect_b32 s1, s4, s1 +; GFX11-NEXT: s_cmp_eq_u32 s7, 2 +; GFX11-NEXT: s_cselect_b32 s2, s4, s2 +; GFX11-NEXT: s_cmp_eq_u32 s7, 3 +; GFX11-NEXT: s_cselect_b32 s3, s4, s3 +; GFX11-NEXT: s_bfe_u32 s7, s0, 0x80008 +; GFX11-NEXT: s_lshr_b32 s4, s0, 24 +; GFX11-NEXT: s_and_b32 s8, s0, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_or_b32 s7, s8, s7 +; 
GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s1, 24 +; GFX11-NEXT: s_or_b32 s0, s7, s0 +; GFX11-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX11-NEXT: s_lshl_b32 s4, s4, 24 +; GFX11-NEXT: s_and_b32 s9, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s7, s7, 8 +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_or_b32 s7, s9, s7 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s0, s0, s4 +; GFX11-NEXT: s_lshl_b32 s4, s5, 24 +; GFX11-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX11-NEXT: s_lshr_b32 s6, s2, 24 +; GFX11-NEXT: s_or_b32 s1, s7, s1 +; GFX11-NEXT: s_and_b32 s7, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s5, s5, 8 +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX11-NEXT: s_or_b32 s5, s7, s5 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_or_b32 s1, s1, s4 +; GFX11-NEXT: s_bfe_u32 s4, s3, 0x80008 +; GFX11-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-NEXT: s_or_b32 s2, s5, s2 +; GFX11-NEXT: s_and_b32 s5, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s4, s4, 8 +; GFX11-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX11-NEXT: s_or_b32 s4, s5, s4 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_lshl_b32 s5, s6, 24 +; GFX11-NEXT: s_or_b32 s3, s4, s3 +; GFX11-NEXT: s_lshl_b32 s4, s8, 24 +; GFX11-NEXT: s_or_b32 s2, s2, s5 +; GFX11-NEXT: s_or_b32 s3, s3, s4 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null @@ -4507,6 +5593,103 @@ ; GFX10-NEXT: v_or3_b32 v3, v3, v16, v9 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v16i8_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], 
off +; GFX11-NEXT: s_lshr_b32 s4, s3, 2 +; GFX11-NEXT: s_and_b32 s1, s3, 3 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 +; GFX11-NEXT: s_lshl_b32 s3, s1, 3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_lshl_b32 s5, 0xff, s3 +; GFX11-NEXT: s_lshl_b32 s2, s2, s3 +; GFX11-NEXT: s_not_b32 s3, s5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v10, v1, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 8 +; GFX11-NEXT: v_bfe_u32 v8, v0, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 8 +; GFX11-NEXT: v_bfe_u32 v12, v2, 8, 8 +; GFX11-NEXT: v_and_or_b32 v1, v1, 0xff, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 8 +; GFX11-NEXT: v_bfe_u32 v14, v3, 8, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_or3_b32 v1, v1, v11, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX11-NEXT: v_bfe_u32 v15, v3, 16, 8 +; GFX11-NEXT: v_and_or_b32 v0, v0, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 8, v14 +; GFX11-NEXT: v_and_or_b32 v2, 0xff, v2, v12 +; GFX11-NEXT: v_or3_b32 v0, v0, v9, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v3, v8 +; GFX11-NEXT: v_or3_b32 v2, v2, v13, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or3_b32 v3, v3, v10, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v5, v2, s0 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1 +; GFX11-NEXT: v_and_or_b32 v4, v4, s3, s2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v4, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v4, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX11-NEXT: v_bfe_u32 v8, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v10, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 8 +; GFX11-NEXT: v_bfe_u32 v12, v2, 8, 8 +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 8 +; GFX11-NEXT: v_bfe_u32 v14, v3, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 8 +; GFX11-NEXT: v_bfe_u32 v15, v3, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 24, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 24, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v8 +; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v4 +; GFX11-NEXT: v_and_or_b32 v2, 0xff, v2, v5 +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v3, v13 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_or3_b32 v0, v0, v9, v16 +; GFX11-NEXT: v_or3_b32 v1, v1, v10, v11 +; GFX11-NEXT: v_or3_b32 v2, v2, v12, v6 +; GFX11-NEXT: v_or3_b32 
v3, v3, v14, v7 +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1 )* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null @@ -4942,6 +6125,115 @@ ; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v16i8_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s10, s0, 0x80008 +; GFX11-NEXT: s_bfe_u32 s12, s1, 0x80008 +; GFX11-NEXT: s_lshr_b32 s6, s1, 24 +; GFX11-NEXT: s_and_b32 s9, s0, 0xff +; GFX11-NEXT: s_and_b32 s11, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s10, s10, 8 +; GFX11-NEXT: s_lshl_b32 s12, s12, 8 +; GFX11-NEXT: s_lshr_b32 s5, s0, 24 +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s9, s9, s10 +; GFX11-NEXT: s_or_b32 s10, s11, s12 +; GFX11-NEXT: s_bfe_u32 s14, s2, 0x80008 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_lshl_b32 s6, s6, 24 +; GFX11-NEXT: s_or_b32 s1, s10, s1 +; GFX11-NEXT: s_lshr_b32 s7, s2, 24 +; GFX11-NEXT: s_and_b32 s13, s2, 0xff +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX11-NEXT: s_lshl_b32 s5, s5, 24 +; GFX11-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-NEXT: s_or_b32 s0, s9, s0 +; GFX11-NEXT: s_or_b32 s1, s1, s6 +; GFX11-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX11-NEXT: s_lshr_b32 s8, s3, 24 +; GFX11-NEXT: s_or_b32 s11, s13, s14 +; GFX11-NEXT: s_or_b32 s0, s0, s5 +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s5, s7, 24 +; GFX11-NEXT: s_and_b32 s7, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s6, s6, 8 +; GFX11-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX11-NEXT: s_or_b32 s2, s11, s2 +; GFX11-NEXT: s_or_b32 s6, s7, s6 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: s_or_b32 s2, s2, 
s5 +; GFX11-NEXT: s_or_b32 s3, s6, s3 +; GFX11-NEXT: s_lshl_b32 s5, s8, 24 +; GFX11-NEXT: s_lshr_b32 s6, s4, 2 +; GFX11-NEXT: s_or_b32 s3, s3, s5 +; GFX11-NEXT: s_cmp_eq_u32 s6, 1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_cselect_b32 s5, s1, s0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 2 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 0 +; GFX11-NEXT: s_cselect_b32 s5, s2, s5 +; GFX11-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-NEXT: s_cselect_b32 s5, s3, s5 +; GFX11-NEXT: s_and_b32 s4, s4, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s4, s4, 3 +; GFX11-NEXT: s_lshl_b32 s7, 0xff, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 s5, s5, s7 +; GFX11-NEXT: v_lshl_or_b32 v4, v0, s4, s5 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_cndmask_b32 v0, v0, v4 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 1 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v7, v1, 8, 8 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 3 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 8 +; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v4 :: v_dual_lshlrev_b32 v6, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 8 
+; GFX11-NEXT: v_bfe_u32 v11, v2, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v5 +; GFX11-NEXT: v_bfe_u32 v5, v3, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-NEXT: v_bfe_u32 v12, v2, 16, 8 +; GFX11-NEXT: v_bfe_u32 v13, v3, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_or_b32 v2, 0xff, v2, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v3, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX11-NEXT: v_or3_b32 v0, v0, v7, v4 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_or3_b32 v1, v1, v8, v6 +; GFX11-NEXT: v_or3_b32 v2, v2, v11, v9 +; GFX11-NEXT: v_or3_b32 v3, v3, v12, v10 +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null @@ -5383,6 +6675,117 @@ ; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v16i8_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xff +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: 
s_bfe_u32 s9, s0, 0x80008 +; GFX11-NEXT: s_bfe_u32 s12, s1, 0x80008 +; GFX11-NEXT: s_lshr_b32 s5, s0, 24 +; GFX11-NEXT: s_and_b32 s8, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_lshr_b32 s6, s1, 24 +; GFX11-NEXT: s_and_b32 s10, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s12, s12, 8 +; GFX11-NEXT: s_bfe_u32 s14, s2, 0x80008 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s9, s10, s12 +; GFX11-NEXT: s_lshr_b32 s7, s2, 24 +; GFX11-NEXT: s_and_b32 s13, s2, 0xff +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX11-NEXT: s_lshl_b32 s5, s5, 24 +; GFX11-NEXT: s_lshl_b32 s14, s14, 8 +; GFX11-NEXT: s_or_b32 s0, s8, s0 +; GFX11-NEXT: s_lshl_b32 s6, s6, 24 +; GFX11-NEXT: s_or_b32 s1, s9, s1 +; GFX11-NEXT: s_or_b32 s10, s13, s14 +; GFX11-NEXT: s_or_b32 s8, s0, s5 +; GFX11-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-NEXT: s_or_b32 s9, s1, s6 +; GFX11-NEXT: s_or_b32 s0, s10, s0 +; GFX11-NEXT: s_lshl_b32 s1, s7, 24 +; GFX11-NEXT: s_bfe_u32 s2, s3, 0x80008 +; GFX11-NEXT: v_mov_b32_e32 v1, s9 +; GFX11-NEXT: s_and_b32 s5, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_or_b32 s10, s0, s1 +; GFX11-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX11-NEXT: s_or_b32 s0, s5, s2 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo +; GFX11-NEXT: s_or_b32 s1, s0, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 +; GFX11-NEXT: s_lshr_b32 s11, s3, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_lshl_b32 s2, s11, 24 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 +; GFX11-NEXT: s_or_b32 s11, s1, s2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 +; GFX11-NEXT: s_and_b32 s2, s4, 0xff +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; GFX11-NEXT: 
v_cmp_eq_u32_e64 s2, 0, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v5, v1, v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v4, v0, 8, 8 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 8 +; GFX11-NEXT: v_bfe_u32 v11, v2, 8, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX11-NEXT: v_bfe_u32 v7, v1, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-NEXT: v_bfe_u32 v12, v2, 16, 8 +; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v4 +; GFX11-NEXT: v_bfe_u32 v4, v3, 8, 8 +; GFX11-NEXT: v_bfe_u32 v13, v3, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX11-NEXT: v_and_or_b32 v2, 0xff, v2, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v3, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX11-NEXT: v_or3_b32 v0, v0, v7, v5 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_or3_b32 v1, v1, v8, v6 +; GFX11-NEXT: 
v_or3_b32 v2, v2, v11, v9 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_or3_b32 v3, v3, v12, v10 +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null @@ -5821,6 +7224,118 @@ ; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_s_v16i8_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 2, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, 0xff +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s9, s0, 0x80008 +; GFX11-NEXT: s_bfe_u32 s11, s1, 0x80008 +; GFX11-NEXT: s_lshr_b32 s4, s0, 24 +; GFX11-NEXT: s_and_b32 s8, s0, 0xff +; GFX11-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX11-NEXT: s_lshl_b32 s9, s9, 8 +; GFX11-NEXT: s_lshr_b32 s5, s1, 24 +; GFX11-NEXT: s_and_b32 s10, s1, 0xff +; GFX11-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX11-NEXT: s_lshl_b32 s11, s11, 8 +; GFX11-NEXT: s_bfe_u32 s13, s2, 0x80008 +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_or_b32 s8, s8, s9 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: s_or_b32 s9, s10, s11 +; GFX11-NEXT: s_lshr_b32 s6, s2, 24 +; GFX11-NEXT: s_and_b32 s12, s2, 0xff +; GFX11-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX11-NEXT: s_lshl_b32 s4, s4, 24 +; GFX11-NEXT: s_lshl_b32 s13, s13, 8 +; GFX11-NEXT: s_or_b32 s0, s8, s0 +; GFX11-NEXT: s_lshl_b32 s5, s5, 24 +; GFX11-NEXT: s_or_b32 s1, s9, s1 +; GFX11-NEXT: s_or_b32 s10, s12, s13 +; GFX11-NEXT: s_or_b32 s4, s0, s4 +; 
GFX11-NEXT: s_lshl_b32 s0, s2, 16 +; GFX11-NEXT: s_or_b32 s5, s1, s5 +; GFX11-NEXT: s_or_b32 s0, s10, s0 +; GFX11-NEXT: s_lshl_b32 s1, s6, 24 +; GFX11-NEXT: s_bfe_u32 s2, s3, 0x80008 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 +; GFX11-NEXT: s_and_b32 s8, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s2, s2, 8 +; GFX11-NEXT: s_or_b32 s6, s0, s1 +; GFX11-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX11-NEXT: s_or_b32 s0, s8, s2 +; GFX11-NEXT: s_lshl_b32 s1, s1, 16 +; GFX11-NEXT: v_cndmask_b32_e32 v2, s4, v2, vcc_lo +; GFX11-NEXT: s_or_b32 s1, s0, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 +; GFX11-NEXT: s_lshr_b32 s7, s3, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_lshl_b32 s2, s7, 24 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s6, s0 +; GFX11-NEXT: s_or_b32 s7, s1, s2 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s7, s1 +; GFX11-NEXT: v_xor_b32_e32 v1, -1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v5, v2, v1, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_mov_b32_e32 v3, s7 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v4, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 8 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; 
GFX11-NEXT: v_bfe_u32 v13, v3, 16, 8 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 8 +; GFX11-NEXT: v_bfe_u32 v11, v2, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v4 +; GFX11-NEXT: v_bfe_u32 v4, v3, 8, 8 +; GFX11-NEXT: v_bfe_u32 v12, v2, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX11-NEXT: v_and_or_b32 v2, 0xff, v2, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v3, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX11-NEXT: v_or3_b32 v0, v0, v7, v5 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_or3_b32 v1, v1, v8, v6 +; GFX11-NEXT: v_or3_b32 v2, v2, v11, v9 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_or3_b32 v3, v3, v12, v10 +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null @@ -6190,6 +7705,104 @@ ; GFX10-NEXT: v_or3_b32 v3, v14, v7, v9 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v16i8_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v2 +; GFX11-NEXT: s_and_b32 s1, s2, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 3, v2 +; 
GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v10, v3, 8, 8 +; GFX11-NEXT: v_bfe_u32 v12, v4, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 8 +; GFX11-NEXT: v_bfe_u32 v13, v4, 16, 8 +; GFX11-NEXT: v_bfe_u32 v14, v5, 8, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v5 +; GFX11-NEXT: v_bfe_u32 v15, v5, 16, 8 +; GFX11-NEXT: v_bfe_u32 v16, v6, 8, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX11-NEXT: v_and_or_b32 v3, v3, 0xff, v10 +; GFX11-NEXT: v_and_or_b32 v4, v4, 0xff, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; GFX11-NEXT: v_bfe_u32 v17, v6, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 8, v16 +; GFX11-NEXT: v_and_or_b32 v5, 0xff, v5, v14 +; GFX11-NEXT: v_or3_b32 v2, v3, v11, v2 +; GFX11-NEXT: v_or3_b32 v3, v4, v13, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; GFX11-NEXT: v_and_or_b32 v6, 0xff, v6, v10 +; GFX11-NEXT: v_or3_b32 v5, v5, v15, v8 +; GFX11-NEXT: v_dual_cndmask_b32 v7, v2, v3 :: v_dual_lshlrev_b32 v0, 3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or3_b32 v4, v6, v12, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v7, v5, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e64 v8, v0, 0xff +; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v8 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v4, s1 +; GFX11-NEXT: v_and_or_b32 v0, v6, v7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, v0, s2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX11-NEXT: v_bfe_u32 v8, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v10, v2, 8, 8 +; GFX11-NEXT: v_bfe_u32 v11, v2, 16, 8 +; GFX11-NEXT: v_bfe_u32 v12, v3, 8, 8 +; GFX11-NEXT: v_bfe_u32 v13, v3, 16, 8 +; GFX11-NEXT: v_bfe_u32 v14, v0, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 8 +; GFX11-NEXT: v_bfe_u32 v15, v0, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 24, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 24, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v8 +; GFX11-NEXT: v_and_or_b32 v2, 0xff, v2, v4 +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v3, v5 +; GFX11-NEXT: v_and_or_b32 v8, 0xff, v0, v13 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_or3_b32 v0, v1, v9, v16 +; GFX11-NEXT: v_or3_b32 v1, v2, v10, v11 +; GFX11-NEXT: v_or3_b32 v2, v3, v12, v6 +; GFX11-NEXT: v_or3_b32 v3, v8, v14, v7 +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null @@ -6556,6 +8169,104 @@ ; GFX10-NEXT: v_or3_b32 v3, v12, v16, v9 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v16i8_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off +; GFX11-NEXT: s_lshr_b32 s3, s2, 2 +; GFX11-NEXT: s_and_b32 s1, s2, 3 +; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, 2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_lshl_b32 s2, s1, 3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, s2, v2 +; GFX11-NEXT: s_lshl_b32 s2, 0xff, s2 +; GFX11-NEXT: s_not_b32 s2, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v11, v4, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 24, v4 +; GFX11-NEXT: v_bfe_u32 v9, v3, 8, 8 +; GFX11-NEXT: v_bfe_u32 v12, v4, 16, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-NEXT: v_bfe_u32 v10, v3, 16, 8 +; GFX11-NEXT: v_bfe_u32 v13, v5, 8, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX11-NEXT: v_and_or_b32 v4, v4, 0xff, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v5 +; GFX11-NEXT: v_bfe_u32 v14, v5, 16, 8 +; GFX11-NEXT: v_bfe_u32 v15, v6, 8, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-NEXT: v_and_or_b32 v3, v3, 0xff, v9 +; GFX11-NEXT: v_or3_b32 v1, v4, v12, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v6 +; GFX11-NEXT: v_bfe_u32 v16, v6, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v7, 24, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 8, v15 +; GFX11-NEXT: v_and_or_b32 v5, 0xff, v5, v13 +; GFX11-NEXT: v_or3_b32 v0, v3, v10, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v8 +; GFX11-NEXT: v_and_or_b32 v4, 0xff, v6, v9 +; GFX11-NEXT: v_or3_b32 v5, v5, v14, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or3_b32 v3, v4, v11, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v6, v5, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1 +; GFX11-NEXT: v_and_or_b32 v2, v4, s2, v2 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, s3, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v2, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v5, v2, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, v2, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX11-NEXT: v_bfe_u32 v8, v0, 8, 8 +; GFX11-NEXT: v_bfe_u32 v10, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v12, v4, 8, 8 +; GFX11-NEXT: v_bfe_u32 v13, v4, 16, 8 +; GFX11-NEXT: v_bfe_u32 v14, v2, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 8 +; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 8 +; GFX11-NEXT: v_bfe_u32 v15, v2, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 24, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; 
GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v8 +; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v10 +; GFX11-NEXT: v_and_or_b32 v8, 0xff, v4, v5 +; GFX11-NEXT: v_and_or_b32 v10, 0xff, v2, v13 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_or3_b32 v0, v0, v9, v3 +; GFX11-NEXT: v_or3_b32 v1, v1, v11, v16 +; GFX11-NEXT: v_or3_b32 v2, v8, v12, v6 +; GFX11-NEXT: v_or3_b32 v3, v10, v14, v7 +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null @@ -6922,6 +8633,104 @@ ; GFX10-NEXT: v_or3_b32 v3, v14, v8, v9 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: insertelement_v_v16i8_v_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v3 +; GFX11-NEXT: v_and_b32_e32 v0, 3, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfe_u32 v11, v4, 8, 8 +; GFX11-NEXT: v_bfe_u32 v13, v5, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v5 +; GFX11-NEXT: v_bfe_u32 v12, v4, 16, 8 +; GFX11-NEXT: v_bfe_u32 v14, v5, 16, 8 +; GFX11-NEXT: v_bfe_u32 v15, v6, 8, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v9, 24, v6 +; GFX11-NEXT: v_bfe_u32 v16, v6, 16, 8 +; GFX11-NEXT: v_bfe_u32 v17, v7, 8, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX11-NEXT: v_and_or_b32 v4, v4, 0xff, v11 +; GFX11-NEXT: v_and_or_b32 v5, v5, 0xff, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 24, v7 +; GFX11-NEXT: v_bfe_u32 v18, v7, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 8, v17 +; GFX11-NEXT: v_and_or_b32 v6, 0xff, v6, v15 +; GFX11-NEXT: v_or3_b32 v3, v4, v12, v3 +; GFX11-NEXT: v_or3_b32 v4, v5, v14, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 24, v10 +; GFX11-NEXT: v_and_or_b32 v7, 0xff, v7, v11 +; GFX11-NEXT: v_or3_b32 v6, v6, v16, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e64 v9, v0, 0xff +; GFX11-NEXT: v_lshlrev_b32_e32 v0, v0, v2 +; GFX11-NEXT: v_or3_b32 v5, v7, v13, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v7, v8, v6, s0 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v5, s1 +; GFX11-NEXT: v_and_or_b32 v0, v7, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, v0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v0, s1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX11-NEXT: v_bfe_u32 v8, v1, 8, 8 +; GFX11-NEXT: v_bfe_u32 v10, v2, 8, 8 +; GFX11-NEXT: 
v_bfe_u32 v11, v2, 16, 8 +; GFX11-NEXT: v_bfe_u32 v12, v3, 8, 8 +; GFX11-NEXT: v_bfe_u32 v13, v3, 16, 8 +; GFX11-NEXT: v_bfe_u32 v14, v0, 8, 8 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 8 +; GFX11-NEXT: v_bfe_u32 v15, v0, 16, 8 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 24, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 8, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 24, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 8, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v8 +; GFX11-NEXT: v_and_or_b32 v2, 0xff, v2, v4 +; GFX11-NEXT: v_and_or_b32 v3, 0xff, v3, v5 +; GFX11-NEXT: v_and_or_b32 v8, 0xff, v0, v13 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: v_or3_b32 v0, v1, v9, v16 +; GFX11-NEXT: v_or3_b32 v1, v2, v10, v11 +; GFX11-NEXT: v_or3_b32 v2, v3, v12, v6 +; GFX11-NEXT: v_or3_b32 v3, v8, v14, v7 +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx store <16 x i8> %insert, <16 x i8> addrspace(1)* null diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < 
%s | FileCheck -check-prefix=GCN %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, <64 x i32> addrspace(1)* %ptr.out) #0 { ; GCN-LABEL: v_insert_v64i32_37: @@ -97,6 +98,59 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_insert_v64i32_37: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 8, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: global_load_b128 v[32:35], v64, s[0:1] +; GFX11-NEXT: global_load_b128 v[36:39], v64, s[0:1] offset:16 +; GFX11-NEXT: global_load_b128 v[40:43], v64, s[0:1] offset:32 +; GFX11-NEXT: global_load_b128 v[44:47], v64, s[0:1] offset:48 +; GFX11-NEXT: global_load_b128 v[48:51], v64, s[0:1] offset:64 +; GFX11-NEXT: global_load_b128 v[52:55], v64, s[0:1] offset:80 +; GFX11-NEXT: global_load_b128 v[56:59], v64, s[0:1] offset:96 +; GFX11-NEXT: global_load_b128 v[60:63], v64, s[0:1] offset:112 +; GFX11-NEXT: global_load_b128 v[4:7], v64, s[0:1] offset:144 +; GFX11-NEXT: global_load_b128 v[0:3], v64, s[0:1] offset:128 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mov_b32_e32 v5, 0x3e7 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: global_load_b128 v[8:11], v64, s[0:1] offset:160 +; GFX11-NEXT: global_load_b128 v[12:15], v64, s[0:1] offset:176 +; GFX11-NEXT: global_load_b128 v[16:19], v64, s[0:1] offset:192 +; GFX11-NEXT: global_load_b128 v[20:23], v64, s[0:1] offset:208 +; GFX11-NEXT: global_load_b128 v[24:27], v64, s[0:1] offset:224 +; GFX11-NEXT: global_load_b128 v[28:31], v64, s[0:1] offset:240 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: 
global_store_b128 v64, v[0:3], s[2:3] offset:128 +; GFX11-NEXT: global_store_b128 v64, v[4:7], s[2:3] offset:144 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: global_store_b128 v64, v[8:11], s[2:3] offset:160 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: global_store_b128 v64, v[12:15], s[2:3] offset:176 +; GFX11-NEXT: global_store_b128 v64, v[32:35], s[2:3] +; GFX11-NEXT: global_store_b128 v64, v[36:39], s[2:3] offset:16 +; GFX11-NEXT: global_store_b128 v64, v[40:43], s[2:3] offset:32 +; GFX11-NEXT: global_store_b128 v64, v[44:47], s[2:3] offset:48 +; GFX11-NEXT: global_store_b128 v64, v[48:51], s[2:3] offset:64 +; GFX11-NEXT: global_store_b128 v64, v[52:55], s[2:3] offset:80 +; GFX11-NEXT: global_store_b128 v64, v[56:59], s[2:3] offset:96 +; GFX11-NEXT: global_store_b128 v64, v[60:63], s[2:3] offset:112 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: global_store_b128 v64, v[16:19], s[2:3] offset:192 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: global_store_b128 v64, v[20:23], s[2:3] offset:208 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: global_store_b128 v64, v[24:27], s[2:3] offset:224 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v64, v[28:31], s[2:3] offset:240 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %ptr.in, i32 %id %vec = load <64 x i32>, <64 x i32> addrspace(1)* %gep.in diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-local.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-local.mir @@ -4,6 +4,7 @@ # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: 
llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s --- @@ -41,6 +42,14 @@ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[DS_CMPST_RTN_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_CMPST_RTN_B32_gfx9_]] + ; GFX11-LABEL: name: atomic_cmpxchg_s32_local + ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[DS_CMPSTORE_RTN_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_CMPSTORE_RTN_B32_gfx9 [[COPY]], [[COPY2]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32), addrspace 3) + ; GFX11-NEXT: $vgpr0 = COPY [[DS_CMPSTORE_RTN_B32_gfx9_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -86,6 +95,14 @@ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[DS_CMPST_RTN_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 4, 0, implicit $exec :: (load store seq_cst (s32), addrspace 3) ; GFX9-NEXT: $vgpr0 = COPY [[DS_CMPST_RTN_B32_gfx9_]] + ; GFX11-LABEL: name: atomic_cmpxchg_s32_local_gep4 + ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: 
[[DS_CMPSTORE_RTN_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_CMPSTORE_RTN_B32_gfx9 [[COPY]], [[COPY2]], [[COPY1]], 4, 0, implicit $exec :: (load store seq_cst (s32), addrspace 3) + ; GFX11-NEXT: $vgpr0 = COPY [[DS_CMPSTORE_RTN_B32_gfx9_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -131,6 +148,14 @@ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 ; GFX9-NEXT: [[DS_CMPST_RTN_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_gfx9_]] + ; GFX11-LABEL: name: atomic_cmpxchg_s64_local + ; GFX11: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX11-NEXT: [[DS_CMPSTORE_RTN_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_CMPSTORE_RTN_B64_gfx9 [[COPY]], [[COPY2]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 3) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[DS_CMPSTORE_RTN_B64_gfx9_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s64) = COPY $vgpr1_vgpr2 %2:vgpr(s64) = COPY $vgpr3_vgpr4 @@ -174,6 +199,14 @@ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 ; GFX9-NEXT: [[DS_CMPST_RTN_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 3) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_gfx9_]] + ; GFX11-LABEL: name: atomic_cmpxchg_s64_local_gep4 + ; GFX11: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX11-NEXT: [[DS_CMPSTORE_RTN_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_CMPSTORE_RTN_B64_gfx9 [[COPY]], 
[[COPY2]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s64), addrspace 3) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[DS_CMPSTORE_RTN_B64_gfx9_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s64) = COPY $vgpr1_vgpr2 %2:vgpr(s64) = COPY $vgpr3_vgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-atomic-cmpxchg-region.mir @@ -4,6 +4,7 @@ # RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX11 %s --- @@ -41,6 +42,14 @@ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst (s32), addrspace 2) ; GFX9-NEXT: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + ; GFX11-LABEL: name: atomic_cmpxchg_s32_region + ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[DS_CMPSTORE_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPSTORE_RTN_B32 [[COPY]], [[COPY2]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst (s32), addrspace 2) + ; GFX11-NEXT: $vgpr0 = COPY 
[[DS_CMPSTORE_RTN_B32_]] %0:vgpr(p2) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -86,6 +95,14 @@ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9-NEXT: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst (s32), addrspace 2) ; GFX9-NEXT: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]] + ; GFX11-LABEL: name: atomic_cmpxchg_s32_region_gep4 + ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX11-NEXT: [[DS_CMPSTORE_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPSTORE_RTN_B32 [[COPY]], [[COPY2]], [[COPY1]], 4, 1, implicit $m0, implicit $exec :: (load store seq_cst (s32), addrspace 2) + ; GFX11-NEXT: $vgpr0 = COPY [[DS_CMPSTORE_RTN_B32_]] %0:vgpr(p2) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -131,6 +148,14 @@ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 ; GFX9-NEXT: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst (s64), addrspace 2) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + ; GFX11-LABEL: name: atomic_cmpxchg_s64_region + ; GFX11: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX11-NEXT: [[DS_CMPSTORE_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPSTORE_RTN_B64 [[COPY]], [[COPY2]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst (s64), addrspace 2) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[DS_CMPSTORE_RTN_B64_]] %0:vgpr(p2) = COPY $vgpr0 %1:vgpr(s64) = COPY $vgpr1_vgpr2 %2:vgpr(s64) = COPY $vgpr3_vgpr4 @@ -174,6 +199,14 
@@ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 ; GFX9-NEXT: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst (s64), addrspace 2) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]] + ; GFX11-LABEL: name: atomic_cmpxchg_s64_region_gep4 + ; GFX11: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4 + ; GFX11-NEXT: [[DS_CMPSTORE_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPSTORE_RTN_B64 [[COPY]], [[COPY2]], [[COPY1]], 0, 1, implicit $m0, implicit $exec :: (load store seq_cst (s64), addrspace 2) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[DS_CMPSTORE_RTN_B64_]] %0:vgpr(p2) = COPY $vgpr0 %1:vgpr(s64) = COPY $vgpr1_vgpr2 %2:vgpr(s64) = COPY $vgpr3_vgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -enable-unsafe-fp-math -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --- name: fract_f64_neg diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir @@ -3,6 +3,7 @@ # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=hawaii 
-run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -amdgpu-global-isel-new-legality -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s16.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fmad.s16.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX6 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX7 %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=legalizer %s -o - | FileCheck -check-prefix=GFX10 %s --- name: test_fmad_s16_flush diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s ; FIXME: Merge with other test. DS offset folding doesn't work due to ; register bank copies, and no return optimization is missing. @@ -74,6 +75,20 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_inc_ret_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false), !noalias !0 store i32 %result, i32 addrspace(1)* %out ret void @@ -139,6 +154,20 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_inc_ret_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = call i32 
@llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out @@ -183,6 +212,14 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: ds_inc_u32 v0, v1 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_inc_noret_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: ds_inc_u32 v0, v1 +; GFX11-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) ret void } @@ -225,6 +262,14 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: ds_inc_u32 v1, v0 offset:16 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_inc_noret_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: ds_inc_u32 v1, v0 offset:16 +; GFX11-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false) ret void @@ -280,6 +325,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_inc_ret_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out ret void @@ -339,6 +395,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v1, 
v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_inc_ret_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32 addrspace(1)* %out @@ -383,6 +450,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_inc_noret_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) ret void } @@ -429,6 +505,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_inc_noret_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) ret void @@ -498,6 +583,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_inc_ret_i32_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u32 v1, v0, v1, s[2:3] offset:20 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id @@ -555,6 +651,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_atomic_inc v0, v1, s[0:1] offset:20 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_inc_noret_i32_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:20 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 @@ -624,6 +729,20 @@ ; GFX10-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-NEXT: global_store_dword v2, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_shl_base_lds_0_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] +; GFX11-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -693,6 +812,21 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_inc_ret_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out ret void @@ -758,6 +892,21 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_inc_ret_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out @@ -806,6 +955,15 @@ ; 
GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: ds_inc_u64 v2, v[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_inc_noret_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: ds_inc_u64 v2, v[0:1] +; GFX11-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void } @@ -852,6 +1010,15 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: ds_inc_u64 v2, v[0:1] offset:32 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lds_atomic_inc_noret_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: ds_inc_u64 v2, v[0:1] offset:32 +; GFX11-NEXT: s_endpgm %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) ret void @@ -911,6 +1078,18 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_inc_ret_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out ret void @@ -974,6 +1153,18 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], 
s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_inc_ret_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64 addrspace(1)* %out @@ -1022,6 +1213,16 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_inc_noret_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) ret void } @@ -1072,6 +1273,16 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_inc_noret_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 
@llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) ret void @@ -1145,6 +1356,18 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_inc_ret_i64_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 42 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v3, v[1:2], s[2:3] offset:40 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id @@ -1206,6 +1429,16 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_atomic_inc_x2 v0, v[1:2], s[0:1] offset:40 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_inc_noret_i64_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[0:1] offset:40 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 @@ -1227,6 +1460,19 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_inc_ret_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; 
GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 glc +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32* %out ret void @@ -1294,6 +1540,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v2 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_inc_ret_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i32, i32* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, i32* %out @@ -1310,6 +1569,16 @@ ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_atomic_inc v[0:1], v2 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_inc_noret_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false) ret void } @@ -1360,6 +1629,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_atomic_inc v[0:1], v2 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: 
flat_atomic_inc_noret_i32_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i32, i32* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) ret void @@ -1447,6 +1726,25 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_dword v[0:1], v3 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 glc +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b32 v[0:1], v3 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32* %ptr, i32 %id %out.gep = getelementptr i32, i32* %out, i32 %id @@ -1514,6 +1812,20 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: flat_atomic_inc v[0:1], v2 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32* %ptr, i32 %id %gep = getelementptr i32, i32* %gep.tid, i32 5 @@ -1587,6 +1899,21 @@ ; GFX10-NEXT: global_store_dword v3, v0, s[2:3] ; GFX10-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_inc_shl_base_lds_0_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 9 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v3, v0, s[2:3] +; GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0 @@ -1611,6 +1938,20 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_inc_ret_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: 
v_mov_b32_e32 v3, s3 +; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] glc +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64* %out ret void @@ -1682,6 +2023,20 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_inc_ret_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i64, i64* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, i64* %out @@ -1699,6 +2054,17 @@ ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_inc_noret_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) ret void } @@ -1753,6 +2119,17 @@ ; 
GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_inc_noret_i64_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i64, i64* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) ret void @@ -1844,6 +2221,27 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 glc +; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64* %ptr, i32 %id %out.gep = getelementptr i64, i64* %out, i32 %id @@ -1915,6 +2313,21 
@@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_addr64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 42 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64* %ptr, i32 %id %gep = getelementptr i64, i64* %gep.tid, i32 5 @@ -1993,6 +2406,22 @@ ; GFX10-NEXT: global_store_dword v1, v2, s[0:1] ; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: nocse_lds_atomic_inc_ret_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: ds_inc_rtn_u32 v2, v1, v0 +; GFX11-NEXT: ds_inc_rtn_u32 v0, v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -verify-machineinstrs < %s | FileCheck %s declare i32 @llvm.amdgcn.ballot.i32(i1) declare i32 @llvm.ctpop.i32(i32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -3,6 +3,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10_W32 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GFX10_W64 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX11_W32 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GFX11_W64 %s define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) { ; GFX7-LABEL: v_div_fmas_f32: @@ -40,6 +42,24 @@ ; GFX10_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2 ; GFX10_W64-NEXT: s_setpc_b64 s[30:31] +; +; GFX11_W32-LABEL: v_div_fmas_f32: +; GFX11_W32: ; %bb.0: +; GFX11_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11_W32-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX11_W32-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11_W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, v0, v1, v2 +; GFX11_W32-NEXT: s_setpc_b64 s[30:31] +; +; GFX11_W64-LABEL: v_div_fmas_f32: +; GFX11_W64: ; %bb.0: +; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11_W64-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11_W64-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, v0, v1, v2 +; GFX11_W64-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) ret float %result } @@ -80,6 +100,24 @@ ; GFX10_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX10_W64-NEXT: s_setpc_b64 s[30:31] +; +; GFX11_W32-LABEL: v_div_fmas_f64: +; GFX11_W32: ; %bb.0: +; GFX11_W32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11_W32-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11_W32-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX11_W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6 +; GFX11_W32-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX11_W32-NEXT: s_setpc_b64 s[30:31] +; +; GFX11_W64-LABEL: v_div_fmas_f64: +; GFX11_W64: ; %bb.0: +; GFX11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11_W64-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11_W64-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX11_W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX11_W64-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX11_W64-NEXT: s_setpc_b64 s[30:31] %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) ret double %result } @@ -132,6 +170,27 @@ ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s3 ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s0, v0, v1 ; GFX10_W64-NEXT: ; return to shader part epilog +; +; GFX11_W32-LABEL: s_div_fmas_f32: +; GFX11_W32: ; %bb.0: +; GFX11_W32-NEXT: s_cmp_eq_u32 s3, 0 +; GFX11_W32-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s2 +; GFX11_W32-NEXT: 
s_cselect_b32 s3, 1, 0 +; GFX11_W32-NEXT: s_and_b32 s3, 1, s3 +; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s3 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, s0, v0, v1 +; GFX11_W32-NEXT: ; return to shader part epilog +; +; GFX11_W64-LABEL: s_div_fmas_f32: +; GFX11_W64: ; %bb.0: +; GFX11_W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s1 +; GFX11_W64-NEXT: s_cselect_b32 s3, 1, 0 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, s2 +; GFX11_W64-NEXT: s_and_b32 s3, 1, s3 +; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s3 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, s0, v0, v1 +; GFX11_W64-NEXT: ; return to shader part epilog %vcc = icmp eq i32 %d, 0 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %vcc) ret float %result @@ -203,6 +262,34 @@ ; GFX10_W64-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10_W64-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10_W64-NEXT: ; return to shader part epilog +; +; GFX11_W32-LABEL: s_div_fmas_f64: +; GFX11_W32: ; %bb.0: +; GFX11_W32-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11_W32-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11_W32-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11_W32-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX11_W32-NEXT: s_and_b32 s6, 1, s6 +; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX11_W32-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3] +; GFX11_W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11_W32-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11_W32-NEXT: ; return to shader part epilog +; +; GFX11_W64-LABEL: s_div_fmas_f64: +; GFX11_W64: ; %bb.0: +; GFX11_W64-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX11_W64-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11_W64-NEXT: v_mov_b32_e32 v2, s4 +; GFX11_W64-NEXT: s_and_b32 s6, 1, s6 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, s3 +; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 +; GFX11_W64-NEXT: v_mov_b32_e32 v3, s5 +; GFX11_W64-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3] +; GFX11_W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11_W64-NEXT: 
v_readfirstlane_b32 s1, v1 +; GFX11_W64-NEXT: ; return to shader part epilog %vcc = icmp eq i32 %d, 0 %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %vcc) ret double %result @@ -284,6 +371,43 @@ ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm +; +; GFX11_W32-LABEL: test_div_fmas_f32: +; GFX11_W32: ; %bb.0: +; GFX11_W32-NEXT: s_clause 0x4 +; GFX11_W32-NEXT: s_load_b32 s2, s[0:1], 0x94 +; GFX11_W32-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11_W32-NEXT: s_load_b32 s4, s[0:1], 0x70 +; GFX11_W32-NEXT: s_load_b32 s5, s[0:1], 0x28 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W32-NEXT: s_and_b32 s2, 1, s2 +; GFX11_W32-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4 +; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, s5, v0, v1 +; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W32-NEXT: s_endpgm +; +; GFX11_W64-LABEL: test_div_fmas_f32: +; GFX11_W64: ; %bb.0: +; GFX11_W64-NEXT: s_clause 0x4 +; GFX11_W64-NEXT: s_load_b32 s2, s[0:1], 0x94 +; GFX11_W64-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11_W64-NEXT: s_load_b32 s4, s[0:1], 0x70 +; GFX11_W64-NEXT: s_load_b32 s5, s[0:1], 0x28 +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W64-NEXT: s_and_b32 s2, 1, s2 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s3 +; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, s4 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, s5, v0, v1 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) store float %result, float addrspace(1)* %out, 
align 4 ret void @@ -357,6 +481,40 @@ ; GFX10_W64-NEXT: v_div_fmas_f32 v0, 1.0, s6, v0 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm +; +; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_0: +; GFX11_W32: ; %bb.0: +; GFX11_W32-NEXT: s_clause 0x3 +; GFX11_W32-NEXT: s_load_b32 s2, s[0:1], 0x94 +; GFX11_W32-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11_W32-NEXT: s_load_b32 s4, s[0:1], 0x4c +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W32-NEXT: s_and_b32 s2, 1, s2 +; GFX11_W32-NEXT: v_mov_b32_e32 v0, s3 +; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0 +; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W32-NEXT: s_endpgm +; +; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_0: +; GFX11_W64: ; %bb.0: +; GFX11_W64-NEXT: s_clause 0x3 +; GFX11_W64-NEXT: s_load_b32 s2, s[0:1], 0x94 +; GFX11_W64-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11_W64-NEXT: s_load_b32 s4, s[0:1], 0x4c +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W64-NEXT: s_and_b32 s2, 1, s2 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s3 +; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0 +; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) store float %result, float addrspace(1)* %out, align 4 ret void @@ -430,6 +588,40 @@ ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, 1.0, v0 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm +; +; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_1: +; GFX11_W32: ; %bb.0: +; GFX11_W32-NEXT: s_clause 0x3 +; GFX11_W32-NEXT: s_load_b32 s2, 
s[0:1], 0x34 +; GFX11_W32-NEXT: s_load_b32 s3, s[0:1], 0x10 +; GFX11_W32-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W32-NEXT: s_and_b32 s2, 1, s2 +; GFX11_W32-NEXT: v_mov_b32_e32 v0, s3 +; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0 +; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W32-NEXT: s_endpgm +; +; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_1: +; GFX11_W64: ; %bb.0: +; GFX11_W64-NEXT: s_clause 0x3 +; GFX11_W64-NEXT: s_load_b32 s2, s[0:1], 0x34 +; GFX11_W64-NEXT: s_load_b32 s3, s[0:1], 0x10 +; GFX11_W64-NEXT: s_load_b32 s4, s[0:1], 0x8 +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W64-NEXT: s_and_b32 s2, 1, s2 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s3 +; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0 +; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) store float %result, float addrspace(1)* %out, align 4 ret void @@ -503,6 +695,40 @@ ; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, v0, 1.0 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm +; +; GFX11_W32-LABEL: test_div_fmas_f32_inline_imm_2: +; GFX11_W32: ; %bb.0: +; GFX11_W32-NEXT: s_clause 0x3 +; GFX11_W32-NEXT: s_load_b32 s2, s[0:1], 0x94 +; GFX11_W32-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11_W32-NEXT: s_load_b32 s4, s[0:1], 0x28 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W32-NEXT: s_and_b32 s2, 1, s2 +; GFX11_W32-NEXT: v_mov_b32_e32 v0, s3 +; 
GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0 +; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W32-NEXT: s_endpgm +; +; GFX11_W64-LABEL: test_div_fmas_f32_inline_imm_2: +; GFX11_W64: ; %bb.0: +; GFX11_W64-NEXT: s_clause 0x3 +; GFX11_W64-NEXT: s_load_b32 s2, s[0:1], 0x94 +; GFX11_W64-NEXT: s_load_b32 s3, s[0:1], 0x4c +; GFX11_W64-NEXT: s_load_b32 s4, s[0:1], 0x28 +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W64-NEXT: s_and_b32 s2, 1, s2 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s3 +; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0 +; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) store float %result, float addrspace(1)* %out, align 4 ret void @@ -582,6 +808,40 @@ ; GFX10_W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX10_W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10_W64-NEXT: s_endpgm +; +; GFX11_W32-LABEL: test_div_fmas_f64: +; GFX11_W32: ; %bb.0: +; GFX11_W32-NEXT: s_clause 0x1 +; GFX11_W32-NEXT: s_load_b32 s8, s[0:1], 0x20 +; GFX11_W32-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W32-NEXT: s_and_b32 s8, 1, s8 +; GFX11_W32-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 +; GFX11_W32-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11_W32-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] +; GFX11_W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX11_W32-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W32-NEXT: s_endpgm +; +; GFX11_W64-LABEL: test_div_fmas_f64: +; GFX11_W64: ; %bb.0: +; GFX11_W64-NEXT: 
s_clause 0x1 +; GFX11_W64-NEXT: s_load_b32 s8, s[0:1], 0x20 +; GFX11_W64-NEXT: s_load_b256 s[0:7], s[0:1], 0x0 +; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W64-NEXT: s_and_b32 s8, 1, s8 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11_W64-NEXT: v_mov_b32_e32 v2, s6 +; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, s5 +; GFX11_W64-NEXT: v_mov_b32_e32 v3, s7 +; GFX11_W64-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] +; GFX11_W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX11_W64-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W64-NEXT: s_endpgm %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) store double %result, double addrspace(1)* %out, align 8 ret void @@ -659,6 +919,41 @@ ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm +; +; GFX11_W32-LABEL: test_div_fmas_f32_cond_to_vcc: +; GFX11_W32: ; %bb.0: +; GFX11_W32-NEXT: s_clause 0x1 +; GFX11_W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W32-NEXT: s_cmp_eq_u32 s7, 0 +; GFX11_W32-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; GFX11_W32-NEXT: s_cselect_b32 s2, 1, 0 +; GFX11_W32-NEXT: s_and_b32 s2, 1, s2 +; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W32-NEXT: s_endpgm +; +; GFX11_W64-LABEL: test_div_fmas_f32_cond_to_vcc: +; GFX11_W64: ; %bb.0: +; GFX11_W64-NEXT: s_clause 0x1 +; GFX11_W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x8 +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W64-NEXT: s_cmp_eq_u32 s7, 0 +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s5 +; GFX11_W64-NEXT: 
s_cselect_b32 s2, 1, 0 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, s6 +; GFX11_W64-NEXT: s_and_b32 s2, 1, s2 +; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W64-NEXT: s_endpgm %cmp = icmp eq i32 %i, 0 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) store float %result, float addrspace(1)* %out, align 4 @@ -731,6 +1026,39 @@ ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm +; +; GFX11_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: +; GFX11_W32: ; %bb.0: +; GFX11_W32-NEXT: s_clause 0x3 +; GFX11_W32-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11_W32-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11_W32-NEXT: s_load_b32 s4, s[0:1], 0x28 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W32-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W32-NEXT: s_endpgm +; +; GFX11_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: +; GFX11_W64: ; %bb.0: +; GFX11_W64-NEXT: s_clause 0x3 +; GFX11_W64-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11_W64-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11_W64-NEXT: s_load_b32 s4, s[0:1], 0x28 +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: s_mov_b64 vcc, 0 +; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, s3 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W64-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) store float %result, float addrspace(1)* %out, align 4 ret void @@ -802,6 +1130,39 @@ ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm +; +; GFX11_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: +; GFX11_W32: ; %bb.0: +; GFX11_W32-NEXT: s_clause 0x3 +; GFX11_W32-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11_W32-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11_W32-NEXT: s_load_b32 s4, s[0:1], 0x28 +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W32-NEXT: s_mov_b32 vcc_lo, -1 +; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W32-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W32-NEXT: s_endpgm +; +; GFX11_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: +; GFX11_W64: ; %bb.0: +; GFX11_W64-NEXT: s_clause 0x3 +; GFX11_W64-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11_W64-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11_W64-NEXT: s_load_b32 s4, s[0:1], 0x28 +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11_W64-NEXT: s_mov_b64 vcc, -1 +; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, s3 +; GFX11_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) store float %result, float addrspace(1)* %out, align 4 ret void @@ -916,6 +1277,54 @@ ; GFX10_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v4 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[4:5] offset:8 ; 
GFX10_W64-NEXT: s_endpgm +; +; GFX11_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc: +; GFX11_W32: ; %bb.0: +; GFX11_W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11_W32-NEXT: s_load_b32 s0, s[0:1], 0x30 +; GFX11_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W32-NEXT: global_load_b32 v2, v1, s[6:7] glc dlc +; GFX11_W32-NEXT: s_waitcnt vmcnt(0) +; GFX11_W32-NEXT: global_load_b32 v3, v1, s[6:7] offset:4 glc dlc +; GFX11_W32-NEXT: s_waitcnt vmcnt(0) +; GFX11_W32-NEXT: global_load_b32 v1, v1, s[6:7] offset:8 glc dlc +; GFX11_W32-NEXT: s_waitcnt vmcnt(0) +; GFX11_W32-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11_W32-NEXT: s_cselect_b32 s0, 1, 0 +; GFX11_W32-NEXT: s_and_b32 s0, 1, s0 +; GFX11_W32-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX11_W32-NEXT: s_and_b32 vcc_lo, vcc_lo, s0 +; GFX11_W32-NEXT: v_div_fmas_f32 v0, v2, v3, v1 +; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W32-NEXT: global_store_b32 v1, v0, s[4:5] offset:8 +; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W32-NEXT: s_endpgm +; +; GFX11_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc: +; GFX11_W64: ; %bb.0: +; GFX11_W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11_W64-NEXT: s_load_b32 s0, s[0:1], 0x30 +; GFX11_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W64-NEXT: global_load_b32 v2, v1, s[6:7] glc dlc +; GFX11_W64-NEXT: s_waitcnt vmcnt(0) +; GFX11_W64-NEXT: global_load_b32 v3, v1, s[6:7] offset:4 glc dlc +; GFX11_W64-NEXT: s_waitcnt vmcnt(0) +; GFX11_W64-NEXT: global_load_b32 v1, v1, s[6:7] offset:8 glc dlc +; GFX11_W64-NEXT: s_waitcnt vmcnt(0) +; GFX11_W64-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11_W64-NEXT: s_cselect_b32 s0, 1, 0 +; GFX11_W64-NEXT: s_and_b32 s0, 1, s0 +; GFX11_W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; GFX11_W64-NEXT: s_and_b64 vcc, vcc, s[0:1] +; GFX11_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v1 +; 
GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W64-NEXT: global_store_b32 v1, v0, s[4:5] offset:8 +; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W64-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 @@ -1065,6 +1474,66 @@ ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] offset:8 ; GFX10_W64-NEXT: s_endpgm +; +; GFX11_W32-LABEL: test_div_fmas_f32_i1_phi_vcc: +; GFX11_W32: ; %bb.0: ; %entry +; GFX11_W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x28 +; GFX11_W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11_W32-NEXT: s_mov_b32 s5, 0 +; GFX11_W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W32-NEXT: global_load_b96 v[1:3], v1, s[2:3] +; GFX11_W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11_W32-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11_W32-NEXT: s_cbranch_execz .LBB13_2 +; GFX11_W32-NEXT: ; %bb.1: ; %bb +; GFX11_W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x50 +; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W32-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W32-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11_W32-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11_W32-NEXT: .LBB13_2: ; %exit +; GFX11_W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11_W32-NEXT: s_and_b32 s0, 1, s5 +; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11_W32-NEXT: s_waitcnt vmcnt(0) +; GFX11_W32-NEXT: v_div_fmas_f32 v0, v1, v2, v3 +; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W32-NEXT: global_store_b32 v1, v0, s[2:3] offset:8 +; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W32-NEXT: s_endpgm +; +; GFX11_W64-LABEL: test_div_fmas_f32_i1_phi_vcc: +; GFX11_W64: ; %bb.0: ; %entry +; GFX11_W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x28 +; GFX11_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX11_W64-NEXT: s_mov_b32 s6, 0 +; 
GFX11_W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W64-NEXT: global_load_b96 v[1:3], v1, s[2:3] +; GFX11_W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX11_W64-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11_W64-NEXT: s_cbranch_execz .LBB13_2 +; GFX11_W64-NEXT: ; %bb.1: ; %bb +; GFX11_W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x50 +; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W64-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W64-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11_W64-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11_W64-NEXT: .LBB13_2: ; %exit +; GFX11_W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX11_W64-NEXT: s_waitcnt vmcnt(0) +; GFX11_W64-NEXT: v_div_fmas_f32 v0, v1, v2, v3 +; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11_W64-NEXT: global_store_b32 v1, v0, s[2:3] offset:8 +; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11_W64-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) { 
@@ -58,6 +59,21 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -125,6 +141,21 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v0, null, v1, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -197,6 +228,22 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f64_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -269,6 +316,22 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f64_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -331,6 +394,19 @@ ; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_scalar_num_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x54 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -391,6 +467,19 @@ ; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, v0, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_scalar_num_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v0, null, s0, v0, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -451,6 +540,19 @@ ; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_scalar_den_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, float addrspace(1)* %in, 
i32 %tid @@ -511,6 +613,19 @@ ; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, s0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_scalar_den_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v0, null, v0, s0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -571,6 +686,20 @@ ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f64_scalar_num_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -631,6 +760,20 @@ ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f64_scalar_num_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54 +; GFX11-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -691,6 +834,20 @@ ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f64_scalar_den_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -751,6 +908,20 @@ ; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f64_scalar_den_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], s[0:1], v[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm 
%tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -800,6 +971,19 @@ ; GFX10-NEXT: v_div_scale_f32 v0, s0, s5, s5, s4 ; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_all_scalar_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -844,6 +1028,19 @@ ; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s5, s4 ; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_all_scalar_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -890,6 +1087,19 @@ ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f64_all_scalar_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; 
GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[2:3] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) %result0 = extractvalue { double, i1 } %result, 0 store double %result0, double addrspace(1)* %out, align 8 @@ -936,6 +1146,19 @@ ; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f64_all_scalar_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x74 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[4:5], s[2:3] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) %result0 = extractvalue { double, i1 } %result, 0 store double %result0, double addrspace(1)* %out, align 8 @@ -988,6 +1211,18 @@ ; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, 1.0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_inline_imm_num: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, 1.0 +; GFX11-NEXT: global_store_b32 
v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %a = load float, float addrspace(1)* %gep.0, align 4 @@ -1044,6 +1279,18 @@ ; GFX10-NEXT: v_div_scale_f32 v0, s2, 2.0, 2.0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_inline_imm_den: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v0, null, 2.0, 2.0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %a = load float, float addrspace(1)* %gep.0, align 4 @@ -1111,6 +1358,23 @@ ; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_fabs_num: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = 
getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -1183,6 +1447,23 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_fabs_den: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -1229,6 +1510,16 @@ ; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_val_undef_val: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false) %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -1266,6 +1557,16 @@ ; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_undef_val_val: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false) %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -1301,6 +1602,16 @@ ; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f32_undef_undef_val: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false) %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -1342,6 +1653,18 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_div_scale_f64_val_undef_val: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_mov_b32 s3, 0x40200000 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false) %result0 = extractvalue { double, i1 } %result, 0 store double %result0, double addrspace(1)* %out, align 8 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps float @ds_fadd_f32_ss(float addrspace(3)* inreg %ptr, float inreg %val) { ; GFX8-LABEL: ds_fadd_f32_ss: @@ -28,6 +29,13 @@ ; GFX10-NEXT: ds_add_rtn_f32 v0, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ds_fadd_f32_ss: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret float %ret } @@ -57,6 +65,13 @@ ; GFX10-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ds_fadd_f32_ss_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 
+; GFX11-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret float %ret @@ -84,6 +99,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: ds_add_f32 v0, v1 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: ds_fadd_f32_ss_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: ds_add_f32 v0, v1 +; GFX11-NEXT: s_endpgm %unused = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret void } @@ -110,6 +131,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: ds_add_f32 v1, v0 offset:512 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: ds_fadd_f32_ss_offset_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: ds_add_f32 v1, v0 offset:512 +; GFX11-NEXT: s_endpgm %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %unused = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret void @@ -131,13 +158,13 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: ds_fadd_f32_vv: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_add_rtn_f32 v0, v0, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: ds_fadd_f32_vv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret float %ret } @@ -158,13 +185,13 @@ ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: ds_fadd_f32_vv_offset: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:512 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: ds_fadd_f32_vv_offset: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:512 +; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret float %ret @@ -186,13 +213,13 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: ds_fadd_f32_vv_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_add_f32 v0, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: ds_fadd_f32_vv_nortn: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: ds_add_f32 v0, v1 +; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret void } @@ -213,13 +240,13 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: ds_fadd_f32_vv_offset_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_add_f32 v0, v1 offset:512 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: ds_fadd_f32_vv_offset_nortn: +; 
GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: ds_add_f32 v0, v1 offset:512 +; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret void @@ -241,13 +268,13 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: ds_fadd_f32_vv_volatile: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_add_rtn_f32 v0, v0, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: ds_fadd_f32_vv_volatile: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 true) ret float %ret } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s +; RUN: 
llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps float @ds_fmin_f32_ss(float addrspace(3)* inreg %ptr, float inreg %val) { ; GFX8-LABEL: ds_fmin_f32_ss: @@ -28,6 +29,13 @@ ; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ds_fmin_f32_ss: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: ds_min_rtn_f32 v0, v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret float %ret } @@ -57,6 +65,13 @@ ; GFX10-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ds_fmin_f32_ss_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret float %ret @@ -84,6 +99,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: ds_min_f32 v0, v1 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: ds_fmin_f32_ss_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: ds_min_f32 v0, v1 +; GFX11-NEXT: s_endpgm %unused = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret void } @@ -110,6 +131,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; 
GFX10-NEXT: ds_min_f32 v1, v0 offset:512 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: ds_fmin_f32_ss_offset_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: ds_min_f32 v1, v0 offset:512 +; GFX11-NEXT: s_endpgm %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %unused = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret void @@ -131,13 +158,13 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: ds_fmin_f32_vv: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: ds_fmin_f32_vv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: ds_min_rtn_f32 v0, v0, v1 +; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret float %ret } @@ -158,13 +185,13 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: ds_fmin_f32_vv_offset: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:512 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: ds_fmin_f32_vv_offset: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:512 +; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %gep, 
float %val, i32 0, i32 0, i1 false) ret float %ret @@ -186,13 +213,13 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: ds_fmin_f32_vv_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_min_f32 v0, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: ds_fmin_f32_vv_nortn: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: ds_min_f32 v0, v1 +; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret void } @@ -213,13 +240,13 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: ds_fmin_f32_vv_offset_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_min_f32 v0, v1 offset:512 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: ds_fmin_f32_vv_offset_nortn: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: ds_min_f32 v0, v1 offset:512 +; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) ret void @@ -241,13 +268,13 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: ds_fmin_f32_vv_volatile: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 
s[30:31] +; GFX10PLUS-LABEL: ds_fmin_f32_vv_volatile: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: ds_min_rtn_f32 v0, v0, v1 +; GFX10PLUS-NEXT: s_waitcnt lgkmcnt(0) +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 true) ret float %ret } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.gws.sema.v.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.gws.sema.v.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.gws.sema.v.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.gws.sema.v.ll @@ -3,3 +3,4 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.sema.v.ll | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %S/../llvm.amdgcn.ds.gws.sema.v.ll ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.sema.v.ll | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %S/../llvm.amdgcn.ds.gws.sema.v.ll ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.sema.v.ll | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %S/../llvm.amdgcn.ds.gws.sema.v.ll +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - -verify-machineinstrs < %S/../llvm.amdgcn.ds.gws.sema.v.ll | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %S/../llvm.amdgcn.ds.gws.sema.v.ll diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -1,26 +1,47 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc 
-global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { -; GCN-LABEL: test_wave32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_dword s1, s[4:5], 0x0 -; GCN-NEXT: s_load_dword s0, s[4:5], 0x24 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_cbranch_scc1 .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %mid -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: .LBB0_2: ; %bb -; GCN-NEXT: s_waitcnt_depctr 0xffe3 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: s_endpgm +; GFX10-LABEL: test_wave32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_cbranch_scc1 .LBB0_2 +; GFX10-NEXT: ; %bb.1: ; %mid +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: .LBB0_2: ; %bb +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_wave32: +; GFX11: ; 
%bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cbranch_scc1 .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %mid +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: .LBB0_2: ; %bb +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %cond = icmp eq i32 %arg0, 0 br i1 %cond, label %mid, label %bb diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fmul.legacy.ll @@ -4,6 +4,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX101 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX103 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s define float @v_mul_legacy_f32(float %a, float %b) { ; GFX6-LABEL: v_mul_legacy_f32: @@ -37,6 +38,13 @@ ; GFX103-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX103-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX103-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mul_legacy_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call float 
@llvm.amdgcn.fmul.legacy(float %a, float %b) ret float %result } @@ -73,6 +81,13 @@ ; GFX103-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX103-NEXT: v_mul_legacy_f32_e32 v0, s4, v0 ; GFX103-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mul_legacy_undef0_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.amdgcn.fmul.legacy(float undef, float %a) ret float %result } @@ -109,6 +124,13 @@ ; GFX103-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX103-NEXT: v_mul_legacy_f32_e32 v0, s4, v0 ; GFX103-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mul_legacy_undef1_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.amdgcn.fmul.legacy(float %a, float undef) ret float %result } @@ -145,6 +167,13 @@ ; GFX103-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX103-NEXT: v_mul_legacy_f32_e64 v0, s4, s4 ; GFX103-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mul_legacy_undef_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, s0, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.amdgcn.fmul.legacy(float undef, float undef) ret float %result } @@ -181,6 +210,13 @@ ; GFX103-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX103-NEXT: v_mul_legacy_f32_e64 v0, |v0|, |v1| ; GFX103-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mul_legacy_fabs_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, |v0|, |v1| +; GFX11-NEXT: s_setpc_b64 s[30:31] %a.fabs = call float @llvm.fabs.f32(float %a) %b.fabs = call float @llvm.fabs.f32(float %b) %result = 
call float @llvm.amdgcn.fmul.legacy(float %a.fabs, float %b.fabs) @@ -219,6 +255,13 @@ ; GFX103-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX103-NEXT: v_mul_legacy_f32_e64 v0, -v0, -v1 ; GFX103-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mul_legacy_fneg_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, -v0, -v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg float %a %b.fneg = fneg float %b %result = call float @llvm.amdgcn.fmul.legacy(float %a.fneg, float %b.fneg) @@ -263,6 +306,15 @@ ; GFX103-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX103-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX103-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_add_mul_legacy_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) %add = fadd float %mul, %c ret float %add @@ -301,6 +353,15 @@ ; GFX103-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX103-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX103-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mad_legacy_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) %add = fadd float %mul, %c ret float %add @@ -339,6 +400,15 @@ ; GFX103-NEXT: v_mul_legacy_f32_e64 v0, -v0, -v1 ; GFX103-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX103-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mad_legacy_fneg_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, -v0, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg float %a %b.fneg = fneg float %b %mul = call float @llvm.amdgcn.fmul.legacy(float %a.fneg, float %b.fneg) @@ -374,6 +444,11 @@ ; GFX103: ; %bb.0: ; GFX103-NEXT: v_mul_legacy_f32_e64 v0, s0, s1 ; GFX103-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_mul_legacy_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog %result = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) ret float %result } @@ -410,6 +485,13 @@ ; GFX103-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX103-NEXT: v_mul_legacy_f32_e32 v0, 1.0, v0 ; GFX103-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mul_legacy_f32_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.amdgcn.fmul.legacy(float %a, float 1.0) ret float %result } @@ -446,6 +528,13 @@ ; GFX103-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX103-NEXT: v_mul_legacy_f32_e32 v0, 1.0, v0 ; GFX103-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mul_legacy_f32_1.0_swap: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.amdgcn.fmul.legacy(float 1.0, float %b) ret float %result } @@ -482,6 +571,13 @@ ; GFX103-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX103-NEXT: v_mul_legacy_f32_e32 v0, 2.0, v0 ; GFX103-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mul_legacy_f32_2.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 
v0, 2.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.amdgcn.fmul.legacy(float %a, float 2.0) ret float %result } @@ -518,6 +614,13 @@ ; GFX103-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX103-NEXT: v_mul_legacy_f32_e32 v0, 2.0, v0 ; GFX103-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mul_legacy_f32_2.0_swap: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, 2.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.amdgcn.fmul.legacy(float 2.0, float %b) ret float %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.icmp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.icmp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.icmp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.icmp.ll @@ -1,36 +1,60 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -global-isel-abort=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -global-isel-abort=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -global-isel-abort=1 -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_ps void @test_intr_icmp_eq_i64(i64 addrspace(1)* %out, i32 %src) #0 { -; GCN-LABEL: test_intr_icmp_eq_i64: -; GCN: ; %bb.0: -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, v2 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GCN-NEXT: s_endpgm +; GFX10-LABEL: test_intr_icmp_eq_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_endpgm +; +; 
GFX11-LABEL: test_intr_icmp_eq_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, v2 +; GFX11-NEXT: v_mov_b32_e32 v3, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %src, i32 100, i32 32) store i64 %result, i64 addrspace(1)* %out ret void } define amdgpu_ps void @test_intr_icmp_ne_i32(i32 addrspace(1)* %out, i32 %src) #1 { -; GCN-LABEL: test_intr_icmp_ne_i32: -; GCN: ; %bb.0: -; GCN-NEXT: v_cmp_ne_u32_e64 s0, 0x64, v2 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: s_endpgm +; GFX10-LABEL: test_intr_icmp_ne_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0x64, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_intr_icmp_ne_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0x64, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 33) store i32 %result, i32 addrspace(1)* %out ret void } define amdgpu_ps void @test_intr_icmp_i32_invalid_cc(i32 addrspace(1)* %out, i32 %src) #1 { -; GCN-LABEL: test_intr_icmp_i32_invalid_cc: -; GCN: ; %bb.0: -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_endpgm +; GFX10-LABEL: test_intr_icmp_i32_invalid_cc: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_intr_icmp_i32_invalid_cc: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call i32 @llvm.amdgcn.icmp.i32.i32(i32 %src, i32 100, i32 9999) store i32 %result, i32 addrspace(1)* %out ret void 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll @@ -1,22 +1,40 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) { -; GCN-LABEL: test_wave32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 -; GCN-NEXT: s_load_dword s1, s[4:5], 0x24 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_eq_u32 s0, 0 -; GCN-NEXT: s_cselect_b32 s0, 1, 0 -; GCN-NEXT: s_and_b32 s0, 1, s0 -; GCN-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: s_endpgm +; GFX10-LABEL: test_wave32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-NEXT: s_load_dword s1, s[4:5], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: global_store_dword v[0:1], v0, off +; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_wave32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_eq_u32 s2, 0 +; GFX11-NEXT: s_cselect_b32 s1, 1, 0 +; GFX11-NEXT: s_and_b32 s1, 1, s1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s1, 0, s1 +; GFX11-NEXT: s_or_b32 s0, s1, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %cond = icmp eq i32 %arg0, 0 %break = call i32 @llvm.amdgcn.if.break.i32(i1 %cond, i32 %saved) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s ; TODO: Merge with DAG test @@ -52,6 +53,22 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: is_private_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 
v[0:1], v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i8*, i8* addrspace(1)* %ptr.ptr, i32 %id %ptr = load volatile i8*, i8* addrspace(1)* %gep @@ -106,6 +123,23 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: .LBB1_2: ; %bb1 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: is_private_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_cmp_lg_u32 s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %bb0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: .LBB1_2: ; %bb1 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.private(i8* %ptr) br i1 %val, label %bb0, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 
-verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s ; TODO: Merge with DAG test @@ -52,6 +53,22 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: is_local_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i8*, i8* addrspace(1)* %ptr.ptr, i32 %id %ptr = load volatile i8*, i8* addrspace(1)* %gep @@ -106,6 +123,23 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: .LBB1_2: ; %bb1 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: is_local_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, s0, 16 +; GFX11-NEXT: s_cmp_lg_u32 s1, s0 +; GFX11-NEXT: s_cbranch_scc1 .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %bb0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: 
global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: .LBB1_2: ; %bb1 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.shared(i8* %ptr) br i1 %val, label %bb0, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.setreg.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s ; FIXME: This test has a DAG duplicate @@ -33,6 +34,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_f32_round_mode_rtz: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 ; encoding: [0x01,0x08,0x80,0xb9,0x03,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 2049, i32 3) call void asm sideeffect "", ""() ret void @@ -60,6 +68,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_f64_round_mode_rtz: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 ; encoding: [0x81,0x08,0x80,0xb9,0x03,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: 
;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 2177, i32 3) call void asm sideeffect "", ""() ret void @@ -87,6 +102,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_all_round_mode_rtz: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7 ; encoding: [0x81,0x18,0x80,0xb9,0x07,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6273, i32 7) call void asm sideeffect "", ""() ret void @@ -114,6 +136,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_roundingmode_var: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0 ; encoding: [0x01,0x08,0x00,0xb9] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 2049, i32 %var.mode) call void asm sideeffect "", ""() ret void @@ -140,6 +169,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_ieee_mode_off: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0 ; encoding: [0x41,0x02,0x80,0xb9,0x00,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 577, i32 0) call void asm sideeffect "", ""() ret void @@ -166,6 +202,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_ieee_mode_on: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1 ; encoding: [0x41,0x02,0x80,0xb9,0x01,0x00,0x00,0x00] +; 
GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 577, i32 1) call void asm sideeffect "", ""() ret void @@ -192,6 +235,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_dx10_clamp_off: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0 ; encoding: [0x01,0x02,0x80,0xb9,0x00,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 513, i32 0) call void asm sideeffect "", ""() ret void @@ -218,6 +268,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_dx10_clamp_on: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1 ; encoding: [0x01,0x02,0x80,0xb9,0x01,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 513, i32 1) call void asm sideeffect "", ""() ret void @@ -245,6 +302,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0 ; encoding: [0x01,0x38,0x00,0xb9] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 inreg %mode) call void asm sideeffect "", ""() ret void @@ -272,6 +336,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_most_both_round_mode_and_denorm_mode: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 0, 7), 6 ; encoding: [0x01,0x30,0x80,0xb9,0x06,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 12289, i32 6) call void asm sideeffect "", ""() ret void @@ -299,6 +370,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_most_both_round_mode_and_denorm_mode_6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6 ; encoding: [0x41,0x10,0x80,0xb9,0x06,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 4161, i32 6) call void asm sideeffect "", ""() ret void @@ -325,6 +403,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_f32_denorm_mode: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; encoding: [0x01,0x09,0x00,0xb9] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 2305, i32 %val) call void asm sideeffect "", ""() ret void @@ -351,6 +436,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_f64_denorm_mode: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0 ; encoding: [0x81,0x09,0x00,0xb9] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 2433, i32 %val) call void asm sideeffect "", ""() ret void @@ -377,6 +469,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode: +; GFX11: ; %bb.0: 
+; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; encoding: [0x01,0x18,0x00,0xb9] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 %val) call void asm sideeffect "", ""() ret void @@ -403,6 +502,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_round_mode_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 0) call void asm sideeffect "", ""() ret void @@ -429,6 +535,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_round_mode_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x1 ; encoding: [0x01,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 1) call void asm sideeffect "", ""() ret void @@ -455,6 +568,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_round_mode_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x2 ; encoding: [0x02,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 2) call void asm sideeffect "", ""() ret void @@ -481,6 +601,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_round_mode_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x4 ; encoding: [0x04,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: 
;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 4) call void asm sideeffect "", ""() ret void @@ -507,6 +634,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_round_mode_8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x8 ; encoding: [0x08,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 8) call void asm sideeffect "", ""() ret void @@ -533,6 +667,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_round_mode_15: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0xf ; encoding: [0x0f,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 15) call void asm sideeffect "", ""() ret void @@ -560,6 +701,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_round_mode_42: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0xa ; encoding: [0x0a,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 42) call void asm sideeffect "", ""() ret void @@ -586,6 +734,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0x92,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 0) call void asm 
sideeffect "", ""() ret void @@ -612,6 +767,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_denorm_mode 1 ; encoding: [0x01,0x00,0x92,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 1) call void asm sideeffect "", ""() ret void @@ -639,6 +801,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_denorm_mode 2 ; encoding: [0x02,0x00,0x92,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 2) call void asm sideeffect "", ""() ret void @@ -665,6 +834,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_denorm_mode 4 ; encoding: [0x04,0x00,0x92,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 4) call void asm sideeffect "", ""() ret void @@ -691,6 +867,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode_8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_denorm_mode 8 ; encoding: [0x08,0x00,0x92,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 8) call void asm sideeffect "", ""() ret void @@ -717,6 +900,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: 
[0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode_15: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_denorm_mode 15 ; encoding: [0x0f,0x00,0x92,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 15) call void asm sideeffect "", ""() ret void @@ -743,6 +933,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode_42: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_denorm_mode 10 ; encoding: [0x0a,0x00,0x92,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 42) call void asm sideeffect "", ""() ret void @@ -771,6 +968,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 0) call void asm sideeffect "", ""() ret void @@ -798,6 +1003,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x1 ; encoding: [0x01,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 1) call void asm sideeffect "", 
""() ret void @@ -825,6 +1038,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x2 ; encoding: [0x02,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 2) call void asm sideeffect "", ""() ret void @@ -852,6 +1073,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x4 ; encoding: [0x04,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 4) call void asm sideeffect "", ""() ret void @@ -879,6 +1108,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x8 ; encoding: [0x08,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 8) call void asm sideeffect "", ""() ret void @@ -906,6 +1143,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 1 ; encoding: [0x01,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: 
test_setreg_full_both_round_mode_and_denorm_mode_16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 1 ; encoding: [0x01,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 16) call void asm sideeffect "", ""() ret void @@ -933,6 +1178,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 2 ; encoding: [0x02,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 2 ; encoding: [0x02,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 32) call void asm sideeffect "", ""() ret void @@ -960,6 +1213,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 4 ; encoding: [0x04,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 4 ; encoding: [0x04,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 64) call void asm sideeffect "", ""() ret void @@ -987,6 +1248,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 8 ; encoding: [0x08,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 
8 ; encoding: [0x08,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 128) call void asm sideeffect "", ""() ret void @@ -1014,6 +1283,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0xf ; encoding: [0x0f,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 15) call void asm sideeffect "", ""() ret void @@ -1041,6 +1318,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 15 ; encoding: [0x0f,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0xf ; encoding: [0x0f,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 15 ; encoding: [0x0f,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 255) call void asm sideeffect "", ""() ret void @@ -1069,6 +1354,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 5 ; encoding: [0x05,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x5 ; encoding: [0x05,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 5 ; encoding: [0x05,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 597) call void asm sideeffect "", ""() ret void @@ -1095,6 +1388,13 @@ 
; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_set_8_bits_straddles_round_and_denorm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff ; encoding: [0x81,0x38,0x80,0xb9,0xff,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14465, i32 255) call void asm sideeffect "", ""() ret void @@ -1121,6 +1421,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_set_4_bits_straddles_round_and_denorm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15 ; encoding: [0x81,0x18,0x80,0xb9,0x0f,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6273, i32 15) call void asm sideeffect "", ""() ret void @@ -1155,6 +1462,17 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s4 ; encoding: [0x01,0x10,0x84,0xb9] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] +; +; GFX11-LABEL: test_setreg_roundingmode_var_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; encoding: [0x00,0x05,0x00,0x7e] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 3), s0 ; encoding: [0x01,0x10,0x00,0xb9] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] call void @llvm.amdgcn.s.setreg(i32 4097, i32 %var.mode) call void asm sideeffect "", ""() ret void diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.writelane.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.writelane.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps float @test_writelane_s_s_s(i32 inreg %data, i32 inreg %lane, i32 inreg %vdst.in) #0 { ; GFX7-LABEL: test_writelane_s_s_s: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s ; FIXME: ; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s @@ -29,6 +30,14 @@ ; GFX10-NEXT: ds_read_b128 v[0:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-LABEL: load_lds_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_b128 v[0:3], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr ret <4 x i32> %load } @@ -185,6 +194,57 @@ ; GFX10-NEXT: v_or3_b32 v2, v8, v9, v7 ; GFX10-NEXT: v_or3_b32 v3, v11, v12, v10 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v4i32_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_u8 v1, v0 +; GFX11-NEXT: ds_load_u8 v2, v0 offset:1 +; GFX11-NEXT: ds_load_u8 v3, v0 offset:2 +; GFX11-NEXT: ds_load_u8 v4, v0 offset:3 +; GFX11-NEXT: ds_load_u8 v5, v0 offset:4 +; GFX11-NEXT: ds_load_u8 v6, v0 offset:5 +; GFX11-NEXT: ds_load_u8 v7, v0 offset:6 +; GFX11-NEXT: ds_load_u8 v8, v0 offset:7 +; GFX11-NEXT: ds_load_u8 v9, v0 offset:8 +; GFX11-NEXT: ds_load_u8 v10, v0 offset:9 +; GFX11-NEXT: ds_load_u8 v11, v0 offset:10 +; GFX11-NEXT: ds_load_u8 v12, v0 offset:11 +; GFX11-NEXT: ds_load_u8 v13, v0 offset:12 +; GFX11-NEXT: ds_load_u8 v14, v0 offset:13 +; GFX11-NEXT: ds_load_u8 v15, v0 offset:15 +; GFX11-NEXT: ds_load_u8 v0, v0 offset:14 +; GFX11-NEXT: s_waitcnt lgkmcnt(14) +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(13) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_waitcnt lgkmcnt(12) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX11-NEXT: s_waitcnt lgkmcnt(10) +; GFX11-NEXT: v_lshl_or_b32 v4, v6, 8, v5 +; GFX11-NEXT: s_waitcnt lgkmcnt(9) +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX11-NEXT: s_waitcnt lgkmcnt(8) +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX11-NEXT: s_waitcnt lgkmcnt(6) +; GFX11-NEXT: v_lshl_or_b32 v7, v10, 8, v9 +; GFX11-NEXT: s_waitcnt lgkmcnt(5) +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; GFX11-NEXT: s_waitcnt lgkmcnt(4) +; GFX11-NEXT: 
v_lshlrev_b32_e32 v8, 24, v12 +; GFX11-NEXT: s_waitcnt lgkmcnt(2) +; GFX11-NEXT: v_lshl_or_b32 v10, v14, 8, v13 +; GFX11-NEXT: s_waitcnt lgkmcnt(1) +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 24, v15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v0 +; GFX11-NEXT: v_or3_b32 v0, v2, v3, v1 +; GFX11-NEXT: v_or3_b32 v1, v5, v6, v4 +; GFX11-NEXT: v_or3_b32 v2, v8, v9, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or3_b32 v3, v11, v12, v10 +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1 ret <4 x i32> %load } @@ -257,6 +317,28 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v4i32_align2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_u16 v1, v0 +; GFX11-NEXT: ds_load_u16 v2, v0 offset:2 +; GFX11-NEXT: ds_load_u16 v3, v0 offset:4 +; GFX11-NEXT: ds_load_u16 v4, v0 offset:6 +; GFX11-NEXT: ds_load_u16 v5, v0 offset:8 +; GFX11-NEXT: ds_load_u16 v6, v0 offset:10 +; GFX11-NEXT: ds_load_u16 v7, v0 offset:12 +; GFX11-NEXT: ds_load_u16 v8, v0 offset:14 +; GFX11-NEXT: s_waitcnt lgkmcnt(6) +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(4) +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX11-NEXT: s_waitcnt lgkmcnt(2) +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_lshl_or_b32 v3, v8, 16, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2 ret <4 x i32> %load } @@ -290,6 +372,16 @@ ; GFX10-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v4i32_align4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: 
v_mov_b32_e32 v2, v0 +; GFX11-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; GFX11-NEXT: ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4 ret <4 x i32> %load } @@ -319,6 +411,14 @@ ; GFX10-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v4i32_align8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_2addr_b64 v[0:3], v0 offset1:1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8 ret <4 x i32> %load } @@ -346,6 +446,14 @@ ; GFX10-NEXT: ds_read_b128 v[0:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v4i32_align16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_b128 v[0:3], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16 ret <4 x i32> %load } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 
-verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s ; FIXME: ; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s @@ -29,6 +30,14 @@ ; GFX10-NEXT: ds_read_b96 v[0:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_b96 v[0:2], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr ret <3 x i32> %load } @@ -153,6 +162,46 @@ ; GFX10-NEXT: v_or3_b32 v1, v5, v6, v4 ; GFX10-NEXT: v_or3_b32 v2, v8, v9, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v3i32_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_u8 v1, v0 +; GFX11-NEXT: ds_load_u8 v2, v0 offset:1 +; GFX11-NEXT: ds_load_u8 v3, v0 offset:2 +; GFX11-NEXT: ds_load_u8 v4, v0 offset:3 +; GFX11-NEXT: ds_load_u8 v5, v0 offset:4 +; GFX11-NEXT: ds_load_u8 v6, v0 offset:5 +; GFX11-NEXT: ds_load_u8 v7, v0 offset:6 +; GFX11-NEXT: ds_load_u8 v8, v0 offset:7 +; GFX11-NEXT: ds_load_u8 v9, v0 offset:8 +; GFX11-NEXT: ds_load_u8 v10, v0 offset:9 +; GFX11-NEXT: ds_load_u8 v11, v0 offset:11 +; GFX11-NEXT: ds_load_u8 v0, v0 offset:10 +; GFX11-NEXT: s_waitcnt lgkmcnt(10) +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(9) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_waitcnt lgkmcnt(8) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX11-NEXT: s_waitcnt lgkmcnt(6) +; GFX11-NEXT: v_lshl_or_b32 v4, v6, 8, v5 +; GFX11-NEXT: s_waitcnt lgkmcnt(5) +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX11-NEXT: s_waitcnt lgkmcnt(4) +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX11-NEXT: s_waitcnt lgkmcnt(2) +; GFX11-NEXT: v_lshl_or_b32 v7, 
v10, 8, v9 +; GFX11-NEXT: s_waitcnt lgkmcnt(1) +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 24, v11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX11-NEXT: v_or3_b32 v0, v2, v3, v1 +; GFX11-NEXT: v_or3_b32 v1, v5, v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_or3_b32 v2, v8, v9, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1 ret <3 x i32> %load } @@ -213,6 +262,24 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v3i32_align2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_u16 v1, v0 +; GFX11-NEXT: ds_load_u16 v2, v0 offset:2 +; GFX11-NEXT: ds_load_u16 v3, v0 offset:4 +; GFX11-NEXT: ds_load_u16 v4, v0 offset:6 +; GFX11-NEXT: ds_load_u16 v5, v0 offset:8 +; GFX11-NEXT: ds_load_u16 v6, v0 offset:10 +; GFX11-NEXT: s_waitcnt lgkmcnt(4) +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(2) +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2 ret <3 x i32> %load } @@ -246,6 +313,16 @@ ; GFX10-NEXT: ds_read_b32 v2, v2 offset:8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v3i32_align4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; GFX11-NEXT: ds_load_b32 v2, v2 offset:8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4 ret <3 x i32> %load } @@ -279,6 +356,16 @@ ; GFX10-NEXT: ds_read_b32 v2, v2 
offset:8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v3i32_align8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; GFX11-NEXT: ds_load_b32 v2, v2 offset:8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8 ret <3 x i32> %load } @@ -306,6 +393,14 @@ ; GFX10-NEXT: ds_read_b96 v[0:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v3i32_align16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_b96 v[0:2], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16 ret <3 x i32> %load } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s ; Unaligned DS access in available from GFX9 onwards. 
; LDS alignment enforcement is controlled by a configuration register: @@ -80,6 +81,14 @@ ; GFX10-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v4i32_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_b128 v[0:3], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1 ret <4 x i32> %load } @@ -145,6 +154,14 @@ ; GFX10-NEXT: ds_read_b32 v2, v2 offset:8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v3i32_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_b96 v[0:2], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1 ret <3 x i32> %load } @@ -200,6 +217,14 @@ ; GFX10-NEXT: ds_write2_b32 v0, v3, v4 offset0:2 offset1:3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: store_lds_v4i32_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_store_b128 v0, v[1:4] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void } @@ -248,6 +273,14 @@ ; GFX10-NEXT: ds_write_b32 v0, v3 offset:8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: store_lds_v3i32_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_store_b96 v0, v[1:3] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] store <3 x i32> %x, <3 x i32> addrspace(3)* %out, 
align 1 ret void } @@ -290,6 +323,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test_s_load_constant_v8i32_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[0:3], v8, s[0:1] +; GFX11-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3] offset:16 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load <8 x i32>, <8 x i32> addrspace(4)* %ptr, align 1 store <8 x i32> %load, <8 x i32> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -2,7 +2,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_lshr_i8(i8 %value, i8 %amount) { ; GFX6-LABEL: v_lshr_i8: @@ -25,14 +26,14 @@ ; GFX9-NEXT: v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_i8: -; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_lshrrev_b16 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_i8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10PLUS-NEXT: v_lshrrev_b16 v0, v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr i8 %value, %amount ret i8 %result } @@ -58,13 +59,13 @@ ; GFX9-NEXT: v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_i8_7: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_lshrrev_b16 v0, 7, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_i8_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 7, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr i8 %value, 7 ret i8 %result } @@ -76,11 +77,11 @@ ; GCN-NEXT: s_lshr_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_i8: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshr_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_i8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i8 %value, %amount ret i8 %result } @@ -91,10 +92,10 @@ ; GCN-NEXT: s_bfe_u32 s0, s0, 0x10007 ; GCN-NEXT: ; return to shader part 
epilog ; -; GFX10-LABEL: s_lshr_i8_7: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x10007 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_i8_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x10007 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i8 %value, 7 ret i8 %result } @@ -109,14 +110,14 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_i24: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_i24: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr i24 %value, %amount ret i24 %result } @@ -128,12 +129,12 @@ ; GCN-NEXT: v_bfe_u32 v0, v0, 7, 17 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_i24_7: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_bfe_u32 v0, v0, 7, 17 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_i24_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 7, 17 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr i24 %value, 7 ret i24 %result } @@ -145,11 +146,11 @@ ; GCN-NEXT: s_lshr_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_i24: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s0, 0xffffff -; GFX10-NEXT: s_lshr_b32 s0, s0, s1 -; GFX10-NEXT: 
; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_i24: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffffff +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i24 %value, %amount ret i24 %result } @@ -160,10 +161,10 @@ ; GCN-NEXT: s_bfe_u32 s0, s0, 0x110007 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_i24_7: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x110007 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_i24_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x110007 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i24 %value, 7 ret i24 %result } @@ -175,12 +176,12 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr i32 %value, %amount ret i32 %result } @@ -192,12 +193,12 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_i32_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 31, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_i32_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 31, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr i32 %value, 31 ret i32 %result } @@ -208,10 +209,10 @@ ; GCN-NEXT: 
s_lshr_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i32 %value, %amount ret i32 %result } @@ -222,10 +223,10 @@ ; GCN-NEXT: s_lshr_b32 s0, s0, 31 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_i32_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, 31 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_i32_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 31 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i32 %value, 31 ret i32 %result } @@ -246,10 +247,10 @@ ; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: lshr_i32_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: lshr_i32_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshrrev_b32_e64 v0, v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i32 %value, %amount %cast = bitcast i32 %result to float ret float %cast @@ -261,10 +262,10 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: lshr_i32_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshrrev_b32_e32 v0, s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: lshr_i32_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i32 %value, %amount %cast = bitcast i32 %result to float ret float %cast @@ -278,13 +279,13 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, v3, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_v2i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, v2, v0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v1, v3, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i32> %value, %amount ret <2 x i32> %result } @@ -297,13 +298,13 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_v2i32_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 31, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 31, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_v2i32_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 31, v0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v1, 31, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i32> %value, ret <2 x i32> %result } @@ -315,11 +316,11 @@ ; GCN-NEXT: s_lshr_b32 s1, s1, s3 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_v2i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s2 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <2 x i32> %value, %amount ret <2 x i32> %result } @@ -333,14 +334,14 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v2, v5, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_v3i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v5, v2 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_v3i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, v3, v0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v1, v4, v1 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v2, v5, v2 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr <3 x i32> %value, %amount ret <3 x i32> %result } @@ -353,12 +354,12 @@ ; GCN-NEXT: s_lshr_b32 s2, s2, s5 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_v3i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, s3 -; GFX10-NEXT: s_lshr_b32 s1, s1, s4 -; GFX10-NEXT: s_lshr_b32 s2, s2, s5 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_v3i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s3 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s4 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, s5 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <3 x i32> %value, %amount ret <3 x i32> %result } @@ -373,15 +374,15 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v3, v7, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_v4i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, v5, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v7, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_v4i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, v4, v0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v1, v5, v1 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX10PLUS-NEXT: 
v_lshrrev_b32_e32 v3, v7, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr <4 x i32> %value, %amount ret <4 x i32> %result } @@ -395,13 +396,13 @@ ; GCN-NEXT: s_lshr_b32 s3, s3, s7 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_v4i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, s4 -; GFX10-NEXT: s_lshr_b32 s1, s1, s5 -; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_lshr_b32 s3, s3, s7 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_v4i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s4 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, s6 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s3, s7 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <4 x i32> %value, %amount ret <4 x i32> %result } @@ -417,16 +418,16 @@ ; GCN-NEXT: v_lshrrev_b32_e32 v4, v9, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_v5i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, v5, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, v6, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, v8, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, v9, v4 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_v5i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, v5, v0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v1, v6, v1 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v2, v7, v2 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v3, v8, v3 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v4, v9, v4 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr <5 x i32> %value, %amount ret <5 x i32> %result } @@ -441,14 +442,14 @@ ; GCN-NEXT: s_lshr_b32 s4, s4, s9 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_v5i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, 
s0, s5 -; GFX10-NEXT: s_lshr_b32 s1, s1, s6 -; GFX10-NEXT: s_lshr_b32 s2, s2, s7 -; GFX10-NEXT: s_lshr_b32 s3, s3, s8 -; GFX10-NEXT: s_lshr_b32 s4, s4, s9 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_v5i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s6 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, s7 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s3, s8 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s4, s9 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <5 x i32> %value, %amount ret <5 x i32> %result } @@ -500,6 +501,30 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v15, v31, v15 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_lshr_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, v16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, v17, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, v18, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v19, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, v20, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v21, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, v22, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, v23, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, v24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, v25, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, v26, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, v27, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, v28, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, v29, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, v30, v14 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v15, v31, v15 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = lshr <16 x i32> %value, %amount ret <16 x i32> %result } @@ -525,25 +550,25 @@ ; GCN-NEXT: s_lshr_b32 s15, s15, s31 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_v16i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, s16 -; GFX10-NEXT: s_lshr_b32 s1, 
s1, s17 -; GFX10-NEXT: s_lshr_b32 s2, s2, s18 -; GFX10-NEXT: s_lshr_b32 s3, s3, s19 -; GFX10-NEXT: s_lshr_b32 s4, s4, s20 -; GFX10-NEXT: s_lshr_b32 s5, s5, s21 -; GFX10-NEXT: s_lshr_b32 s6, s6, s22 -; GFX10-NEXT: s_lshr_b32 s7, s7, s23 -; GFX10-NEXT: s_lshr_b32 s8, s8, s24 -; GFX10-NEXT: s_lshr_b32 s9, s9, s25 -; GFX10-NEXT: s_lshr_b32 s10, s10, s26 -; GFX10-NEXT: s_lshr_b32 s11, s11, s27 -; GFX10-NEXT: s_lshr_b32 s12, s12, s28 -; GFX10-NEXT: s_lshr_b32 s13, s13, s29 -; GFX10-NEXT: s_lshr_b32 s14, s14, s30 -; GFX10-NEXT: s_lshr_b32 s15, s15, s31 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_v16i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s16 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s17 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, s18 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s3, s19 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s4, s20 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s5, s21 +; GFX10PLUS-NEXT: s_lshr_b32 s6, s6, s22 +; GFX10PLUS-NEXT: s_lshr_b32 s7, s7, s23 +; GFX10PLUS-NEXT: s_lshr_b32 s8, s8, s24 +; GFX10PLUS-NEXT: s_lshr_b32 s9, s9, s25 +; GFX10PLUS-NEXT: s_lshr_b32 s10, s10, s26 +; GFX10PLUS-NEXT: s_lshr_b32 s11, s11, s27 +; GFX10PLUS-NEXT: s_lshr_b32 s12, s12, s28 +; GFX10PLUS-NEXT: s_lshr_b32 s13, s13, s29 +; GFX10PLUS-NEXT: s_lshr_b32 s14, s14, s30 +; GFX10PLUS-NEXT: s_lshr_b32 s15, s15, s31 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <16 x i32> %value, %amount ret <16 x i32> %result } @@ -569,12 +594,12 @@ ; GFX9-NEXT: v_lshrrev_b16_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b16 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshrrev_b16 v0, v1, v0 +; GFX10PLUS-NEXT: 
s_setpc_b64 s[30:31] %result = lshr i16 %value, %amount ret i16 %result } @@ -585,11 +610,11 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_i16_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_i16_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr i16 %value, 31 ret i16 %result } @@ -601,11 +626,11 @@ ; GCN-NEXT: s_lshr_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: s_lshr_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i16 %value, %amount ret i16 %result } @@ -616,10 +641,10 @@ ; GCN-NEXT: s_bfe_u32 s0, s0, 0x1000f ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_i16_15: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x1000f -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_i16_15: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_u32 s0, s0, 0x1000f +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i16 %value, 15 ret i16 %result } @@ -642,10 +667,10 @@ ; GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: lshr_i16_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshrrev_b16 v0, v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: lshr_i16_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshrrev_b16 v0, v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i16 %value, 
%amount %cast = bitcast i16 %result to half ret half %cast @@ -669,10 +694,10 @@ ; GFX9-NEXT: v_lshrrev_b16_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: lshr_i16_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshrrev_b16 v0, s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: lshr_i16_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshrrev_b16 v0, s0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i16 %value, %amount %cast = bitcast i16 %result to half ret half %cast @@ -704,12 +729,12 @@ ; GFX9-NEXT: v_pk_lshrrev_b16 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_v2i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_lshrrev_b16 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_v2i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_lshrrev_b16 v0, v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i16> %value, %amount ret <2 x i16> %result } @@ -737,12 +762,12 @@ ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_v2i16_15: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_v2i16_15: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i16> %value, ret <2 x i16> %result } @@ -780,15 +805,15 @@ ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_v2i16: -; 
GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10-NEXT: s_lshr_b32 s0, s0, s1 -; GFX10-NEXT: s_lshr_b32 s1, s2, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_v2i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s2, s3 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to i32 ret i32 %cast @@ -821,10 +846,10 @@ ; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: lshr_v2i16_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_pk_lshrrev_b16 v0, v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: lshr_v2i16_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_pk_lshrrev_b16 v0, v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to float ret float %cast @@ -857,10 +882,10 @@ ; GFX9-NEXT: v_pk_lshrrev_b16 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: lshr_v2i16_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_pk_lshrrev_b16 v0, s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: lshr_v2i16_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_pk_lshrrev_b16 v0, s0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to float ret float %cast @@ -917,13 +942,13 @@ ; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_v4i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 
0x0 -; GFX10-NEXT: v_pk_lshrrev_b16 v0, v2, v0 -; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_v4i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_lshrrev_b16 v0, v2, v0 +; GFX10PLUS-NEXT: v_pk_lshrrev_b16 v1, v3, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x float> ret <2 x float> %cast @@ -982,21 +1007,21 @@ ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_v4i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_lshr_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s2, s4, s5 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 -; GFX10-NEXT: s_lshr_b32 s3, s4, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_v4i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s5, s2, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s2 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s4, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s4, s5 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x i32> ret <2 x i32> %cast @@ -1089,15 +1114,15 @@ ; GFX9-NEXT: v_pk_lshrrev_b16 v3, v7, v3 ; 
GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_v8i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_lshrrev_b16 v0, v4, v0 -; GFX10-NEXT: v_pk_lshrrev_b16 v1, v5, v1 -; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2 -; GFX10-NEXT: v_pk_lshrrev_b16 v3, v7, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_v8i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_lshrrev_b16 v0, v4, v0 +; GFX10PLUS-NEXT: v_pk_lshrrev_b16 v1, v5, v1 +; GFX10PLUS-NEXT: v_pk_lshrrev_b16 v2, v6, v2 +; GFX10PLUS-NEXT: v_pk_lshrrev_b16 v3, v7, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x float> ret <4 x float> %cast @@ -1196,33 +1221,33 @@ ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_v8i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s8, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: s_lshr_b32 s9, s4, 16 -; GFX10-NEXT: s_lshr_b32 s0, s0, s4 -; GFX10-NEXT: s_lshr_b32 s4, s8, s9 -; GFX10-NEXT: s_lshr_b32 s8, s1, 16 -; GFX10-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-NEXT: s_lshr_b32 s9, s5, 16 -; GFX10-NEXT: s_lshr_b32 s1, s1, s5 -; GFX10-NEXT: s_lshr_b32 s5, s8, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s2, 0xffff -; GFX10-NEXT: s_lshr_b32 s5, s6, 16 -; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_lshr_b32 s4, s4, s5 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-NEXT: s_lshr_b32 s6, s7, 16 -; GFX10-NEXT: s_lshr_b32 s3, s3, s7 -; GFX10-NEXT: s_lshr_b32 s5, s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 -; GFX10-NEXT: ; 
return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_v8i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s9, s4, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s4 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s8, s9 +; GFX10PLUS-NEXT: s_lshr_b32 s8, s1, 16 +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s9, s5, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s8, s9 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10PLUS-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s5, s6, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, s6 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s4, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10PLUS-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s6, s7, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s3, s7 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s5, s6 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x i32> ret <4 x i32> %cast @@ -1247,12 +1272,12 @@ ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v2, v[0:1] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], v2, v[0:1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr i64 %value, %amount ret i64 %result } @@ -1265,13 +1290,13 @@ ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; 
GFX10-LABEL: v_lshr_i64_63: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_i64_63: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, 0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr i64 %value, 63 ret i64 %result } @@ -1284,13 +1309,13 @@ ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_i64_33: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_i64_33: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 1, v1 +; GFX10PLUS-NEXT: v_mov_b32_e32 v1, 0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr i64 %value, 33 ret i64 %result } @@ -1310,6 +1335,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_lshr_i64_32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = lshr i64 %value, 32 ret i64 %result } @@ -1333,12 +1365,12 @@ ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 31, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_i64_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], 31, 
v[0:1] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_i64_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 31, v[0:1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr i64 %value, 31 ret i64 %result } @@ -1349,10 +1381,10 @@ ; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i64 %value, %amount ret i64 %result } @@ -1364,11 +1396,11 @@ ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_i64_63: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s1, 31 -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_i64_63: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 31 +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i64 %value, 63 ret i64 %result } @@ -1380,11 +1412,11 @@ ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_i64_33: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s1, 1 -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_i64_33: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i64 %value, 33 ret i64 %result } @@ -1396,11 +1428,11 @@ ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_i64_32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, s1 -; GFX10-NEXT: s_mov_b32 s1, 0 -; 
GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_i64_32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, s1 +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i64 %value, 32 ret i64 %result } @@ -1411,10 +1443,10 @@ ; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 31 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_i64_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 31 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_i64_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], 31 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i64 %value, 31 ret i64 %result } @@ -1435,10 +1467,10 @@ ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: lshr_i64_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: lshr_i64_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i64 %value, %amount %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -1460,10 +1492,10 @@ ; GFX9-NEXT: v_lshrrev_b64 v[0:1], s0, v[0:1] ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: lshr_i64_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s0, v[0:1] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: lshr_i64_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], s0, v[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i64 %value, %amount %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -1491,13 +1523,13 @@ ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v6, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_v2i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v4, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v6, v[2:3] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_v2i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], v4, v[0:1] +; GFX10PLUS-NEXT: v_lshrrev_b64 v[2:3], v6, v[2:3] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i64> %value, %amount ret <2 x i64> %result } @@ -1524,13 +1556,13 @@ ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 31, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_lshr_v2i64_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], 31, v[0:1] -; GFX10-NEXT: v_lshrrev_b64 v[2:3], 31, v[2:3] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_lshr_v2i64_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 31, v[0:1] +; GFX10PLUS-NEXT: v_lshrrev_b64 v[2:3], 31, v[2:3] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i64> %value, ret <2 x i64> %result } @@ -1542,11 +1574,11 @@ ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_v2i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_v2i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr <2 x i64> %value, %amount ret <2 x i64> %result } @@ -1640,6 +1672,28 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-LABEL: v_lshr_i65: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v4, 1, v2 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v3 +; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v3 +; GFX11-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v2, v[4:5] +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] +; GFX11-NEXT: v_lshrrev_b64 v[4:5], v3, v[4:5] +; GFX11-NEXT: v_or_b32_e32 v2, v6, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = lshr i65 %value, %amount ret i65 %result } @@ -1693,6 +1747,18 @@ ; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_lshr_i65_33: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v2 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v3 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] +; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = lshr i65 %value, 33 ret i65 %result } @@ -1720,27 +1786,27 @@ ; GCN-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_i65: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], 1 -; GFX10-NEXT: s_sub_i32 s10, s3, 64 -; GFX10-NEXT: s_sub_i32 s2, 64, s3 -; GFX10-NEXT: s_cmp_lt_u32 s3, 64 -; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: 
s_cmp_eq_u32 s3, 0 -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s2 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s3 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s10 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_lshr_i65: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1 +; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64 +; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3 +; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 +; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 +; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2 +; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[4:5], s3 +; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s10 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i65 %value, %amount ret i65 %result } @@ -1756,15 +1822,15 @@ ; GCN-NEXT: s_lshr_b32 s2, s3, 1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_lshr_i65_33: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s0, s1, 1 -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 -; GFX10-NEXT: s_lshr_b32 s2, s3, 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX10-NEXT: ; return to shader part 
epilog +; GFX10PLUS-LABEL: s_lshr_i65_33: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: s_lshl_b64 s[4:5], s[2:3], 31 +; GFX10PLUS-NEXT: s_lshr_b32 s2, s3, 1 +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX10PLUS-NEXT: ; return to shader part epilog %result = lshr i65 %value, 33 ret i65 %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -2,7 +2,8 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { ; GFX7-LABEL: s_mul_i16: @@ -24,12 +25,12 @@ ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_mul_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-NEXT: s_mul_i32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_mul_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 +; GFX10PLUS-NEXT: ; 
return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -55,12 +56,12 @@ ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_mul_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_mul_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -88,13 +89,13 @@ ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_mul_i16_zeroext: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-NEXT: s_mul_i32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_mul_i16_zeroext: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -119,13 +120,13 @@ ; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_mul_i16_zeroext: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_mul_i16_zeroext: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16 +; 
GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -153,13 +154,13 @@ ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_mul_i16_signext: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-NEXT: s_mul_i32 s0, s0, s1 -; GFX10-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_mul_i16_signext: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 +; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -188,13 +189,13 @@ ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_mul_i16_signext: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_mul_i16_signext: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX10PLUS-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -205,10 +206,10 @@ ; GCN-NEXT: s_mul_i32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_mul_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mul_i32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_mul_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i32 %num, %den ret i32 %result } @@ -220,12 +221,12 @@ ; GCN-NEXT: v_mul_lo_u32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
v_mul_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_mul_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = mul i32 %num, %den ret i32 %result } @@ -237,11 +238,11 @@ ; GCN-NEXT: s_mul_i32 s1, s1, s3 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_mul_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mul_i32 s0, s0, s2 -; GFX10-NEXT: s_mul_i32 s1, s1, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_mul_v2i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2 +; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul <2 x i32> %num, %den ret <2 x i32> %result } @@ -254,13 +255,13 @@ ; GCN-NEXT: v_mul_lo_u32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_mul_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2 -; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_mul_v2i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX10PLUS-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = mul <2 x i32> %num, %den ret <2 x i32> %result } @@ -303,15 +304,15 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_mul_i33: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mul_hi_u32 s4, s0, s2 -; GFX10-NEXT: s_mul_i32 s3, s0, s3 -; GFX10-NEXT: s_mul_i32 s1, s1, s2 -; GFX10-NEXT: s_add_i32 s3, s4, s3 -; 
GFX10-NEXT: s_mul_i32 s0, s0, s2 -; GFX10-NEXT: s_add_i32 s1, s3, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_mul_i33: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mul_hi_u32 s4, s0, s2 +; GFX10PLUS-NEXT: s_mul_i32 s3, s0, s3 +; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s2 +; GFX10PLUS-NEXT: s_add_i32 s3, s4, s3 +; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2 +; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i33 %num, %den ret i33 %result } @@ -354,15 +355,15 @@ ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_mul_i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mul_hi_u32 s4, s0, s2 -; GFX10-NEXT: s_mul_i32 s3, s0, s3 -; GFX10-NEXT: s_mul_i32 s1, s1, s2 -; GFX10-NEXT: s_add_i32 s3, s4, s3 -; GFX10-NEXT: s_mul_i32 s0, s0, s2 -; GFX10-NEXT: s_add_i32 s1, s3, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_mul_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mul_hi_u32 s4, s0, s2 +; GFX10PLUS-NEXT: s_mul_i32 s3, s0, s3 +; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s2 +; GFX10PLUS-NEXT: s_add_i32 s3, s4, s3 +; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2 +; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i64 %num, %den ret i64 %result } @@ -389,6 +390,17 @@ ; GFX10-NEXT: v_mul_lo_u32 v2, v5, v2 ; GFX10-NEXT: v_add3_u32 v1, v1, v3, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mul_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0 +; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3 +; GFX11-NEXT: v_mul_lo_u32 v2, v5, v2 +; GFX11-NEXT: v_add3_u32 v1, v1, v3, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = mul i64 %num, %den ret i64 %result } @@ -466,25 +478,25 @@ ; GFX9-NEXT: s_mov_b32 s0, s6 ; GFX9-NEXT: ; return to shader part 
epilog ; -; GFX10-LABEL: s_mul_i96: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mul_i32 s6, s0, s5 -; GFX10-NEXT: s_mul_i32 s7, s1, s4 -; GFX10-NEXT: s_mul_i32 s2, s2, s3 -; GFX10-NEXT: s_add_i32 s6, s6, s7 -; GFX10-NEXT: s_mul_hi_u32 s7, s0, s3 -; GFX10-NEXT: s_add_i32 s6, s6, s2 -; GFX10-NEXT: s_mul_i32 s2, s0, s4 -; GFX10-NEXT: s_mul_i32 s5, s0, s3 -; GFX10-NEXT: s_mul_hi_u32 s0, s0, s4 -; GFX10-NEXT: s_add_u32 s2, s2, s7 -; GFX10-NEXT: s_mul_i32 s4, s1, s3 -; GFX10-NEXT: s_addc_u32 s0, s0, s6 -; GFX10-NEXT: s_mul_hi_u32 s3, s1, s3 -; GFX10-NEXT: s_add_u32 s1, s4, s2 -; GFX10-NEXT: s_addc_u32 s2, s3, s0 -; GFX10-NEXT: s_mov_b32 s0, s5 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_mul_i96: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mul_i32 s6, s0, s5 +; GFX10PLUS-NEXT: s_mul_i32 s7, s1, s4 +; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s3 +; GFX10PLUS-NEXT: s_add_i32 s6, s6, s7 +; GFX10PLUS-NEXT: s_mul_hi_u32 s7, s0, s3 +; GFX10PLUS-NEXT: s_add_i32 s6, s6, s2 +; GFX10PLUS-NEXT: s_mul_i32 s2, s0, s4 +; GFX10PLUS-NEXT: s_mul_i32 s5, s0, s3 +; GFX10PLUS-NEXT: s_mul_hi_u32 s0, s0, s4 +; GFX10PLUS-NEXT: s_add_u32 s2, s2, s7 +; GFX10PLUS-NEXT: s_mul_i32 s4, s1, s3 +; GFX10PLUS-NEXT: s_addc_u32 s0, s0, s6 +; GFX10PLUS-NEXT: s_mul_hi_u32 s3, s1, s3 +; GFX10PLUS-NEXT: s_add_u32 s1, s4, s2 +; GFX10PLUS-NEXT: s_addc_u32 s2, s3, s0 +; GFX10PLUS-NEXT: s_mov_b32 s0, s5 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i96 %num, %den %cast = bitcast i96 %result to <3 x i32> ret <3 x i32> %cast @@ -519,6 +531,20 @@ ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2] ; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mul_i96: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 +; GFX11-NEXT: v_mul_lo_u32 v2, v2, v3 +; GFX11-NEXT: v_mul_lo_u32 v5, v6, v5 +; GFX11-NEXT: 
v_mul_lo_u32 v8, v7, v4 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0 +; GFX11-NEXT: v_add3_u32 v2, v5, v8, v2 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = mul i96 %num, %den ret i96 %result } @@ -657,41 +683,41 @@ ; GFX9-NEXT: s_mov_b32 s2, s10 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_mul_i128: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mul_i32 s9, s0, s6 -; GFX10-NEXT: s_mul_i32 s11, s1, s5 -; GFX10-NEXT: s_mul_hi_u32 s10, s0, s6 -; GFX10-NEXT: s_mul_hi_u32 s12, s1, s5 -; GFX10-NEXT: s_add_u32 s9, s11, s9 -; GFX10-NEXT: s_mul_i32 s11, s2, s4 -; GFX10-NEXT: s_addc_u32 s10, s12, s10 -; GFX10-NEXT: s_mul_hi_u32 s12, s2, s4 -; GFX10-NEXT: s_mul_hi_u32 s8, s0, s4 -; GFX10-NEXT: s_add_u32 s9, s11, s9 -; GFX10-NEXT: s_mul_i32 s11, s0, s5 -; GFX10-NEXT: s_addc_u32 s10, s12, s10 -; GFX10-NEXT: s_mul_hi_u32 s12, s0, s5 -; GFX10-NEXT: s_add_u32 s8, s11, s8 -; GFX10-NEXT: s_addc_u32 s9, s12, s9 -; GFX10-NEXT: s_mul_i32 s12, s1, s4 -; GFX10-NEXT: s_mul_hi_u32 s13, s1, s4 -; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_add_u32 s8, s12, s8 -; GFX10-NEXT: s_mul_i32 s12, s0, s7 -; GFX10-NEXT: s_addc_u32 s7, s13, s9 -; GFX10-NEXT: s_addc_u32 s9, s10, s12 -; GFX10-NEXT: s_mul_i32 s1, s1, s6 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10-NEXT: s_mul_i32 s2, s2, s5 -; GFX10-NEXT: s_addc_u32 s1, s9, s1 -; GFX10-NEXT: s_mul_i32 s3, s3, s4 -; GFX10-NEXT: s_add_i32 s1, s1, s2 -; GFX10-NEXT: s_mul_i32 s0, s0, s4 -; GFX10-NEXT: s_add_i32 s3, s1, s3 -; GFX10-NEXT: s_mov_b32 s1, s8 -; GFX10-NEXT: s_mov_b32 s2, s7 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_mul_i128: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mul_i32 s9, s0, s6 +; GFX10PLUS-NEXT: s_mul_i32 s11, s1, s5 +; GFX10PLUS-NEXT: s_mul_hi_u32 s10, s0, s6 +; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s1, s5 +; GFX10PLUS-NEXT: s_add_u32 s9, s11, s9 +; GFX10PLUS-NEXT: s_mul_i32 
s11, s2, s4 +; GFX10PLUS-NEXT: s_addc_u32 s10, s12, s10 +; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s2, s4 +; GFX10PLUS-NEXT: s_mul_hi_u32 s8, s0, s4 +; GFX10PLUS-NEXT: s_add_u32 s9, s11, s9 +; GFX10PLUS-NEXT: s_mul_i32 s11, s0, s5 +; GFX10PLUS-NEXT: s_addc_u32 s10, s12, s10 +; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s0, s5 +; GFX10PLUS-NEXT: s_add_u32 s8, s11, s8 +; GFX10PLUS-NEXT: s_addc_u32 s9, s12, s9 +; GFX10PLUS-NEXT: s_mul_i32 s12, s1, s4 +; GFX10PLUS-NEXT: s_mul_hi_u32 s13, s1, s4 +; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s8, s12, s8 +; GFX10PLUS-NEXT: s_mul_i32 s12, s0, s7 +; GFX10PLUS-NEXT: s_addc_u32 s7, s13, s9 +; GFX10PLUS-NEXT: s_addc_u32 s9, s10, s12 +; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s6 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s5 +; GFX10PLUS-NEXT: s_addc_u32 s1, s9, s1 +; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s4 +; GFX10PLUS-NEXT: s_add_i32 s1, s1, s2 +; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s4 +; GFX10PLUS-NEXT: s_add_i32 s3, s1, s3 +; GFX10PLUS-NEXT: s_mov_b32 s1, s8 +; GFX10PLUS-NEXT: s_mov_b32 s2, s7 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i128 %num, %den %cast = bitcast i128 %result to <4 x i32> ret <4 x i32> %cast @@ -783,6 +809,28 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo ; GFX10-NEXT: v_add3_u32 v3, v4, v5, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mul_i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 +; GFX11-NEXT: v_mov_b32_e32 v10, v2 +; GFX11-NEXT: v_mul_lo_u32 v3, v3, v4 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0 +; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7 +; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v9, v5, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v4, 0 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v10, v4, v[11:12] +; GFX11-NEXT: 
v_mov_b32_e32 v2, v11 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] +; GFX11-NEXT: v_mul_lo_u32 v5, v10, v5 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v4, v[1:2] +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v7, v6, vcc_lo +; GFX11-NEXT: v_add3_u32 v3, v4, v5, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = mul i128 %num, %den ret i128 %result } @@ -1410,184 +1458,184 @@ ; GFX9-NEXT: s_mov_b32 s6, s22 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_mul_i256: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mul_i32 s17, s0, s10 -; GFX10-NEXT: s_mul_i32 s19, s1, s9 -; GFX10-NEXT: s_mul_hi_u32 s18, s0, s10 -; GFX10-NEXT: s_mul_hi_u32 s20, s1, s9 -; GFX10-NEXT: s_add_u32 s17, s19, s17 -; GFX10-NEXT: s_addc_u32 s18, s20, s18 -; GFX10-NEXT: s_mul_i32 s20, s2, s8 -; GFX10-NEXT: s_mul_hi_u32 s21, s2, s8 -; GFX10-NEXT: s_cselect_b32 s19, 1, 0 -; GFX10-NEXT: s_add_u32 s17, s20, s17 -; GFX10-NEXT: s_mul_hi_u32 s16, s0, s8 -; GFX10-NEXT: s_addc_u32 s18, s21, s18 -; GFX10-NEXT: s_mul_i32 s21, s0, s9 -; GFX10-NEXT: s_mul_hi_u32 s22, s0, s9 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_add_u32 s16, s21, s16 -; GFX10-NEXT: s_addc_u32 s17, s22, s17 -; GFX10-NEXT: s_mul_i32 s22, s1, s8 -; GFX10-NEXT: s_mul_hi_u32 s23, s1, s8 -; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_add_u32 s16, s22, s16 -; GFX10-NEXT: s_addc_u32 s17, s23, s17 -; GFX10-NEXT: s_mul_i32 s23, s0, s12 -; GFX10-NEXT: s_mul_i32 s25, s1, s11 -; GFX10-NEXT: s_mul_hi_u32 s24, s0, s12 -; GFX10-NEXT: s_mul_hi_u32 s26, s1, s11 -; GFX10-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10-NEXT: s_add_u32 s23, s25, s23 -; GFX10-NEXT: s_addc_u32 s24, s26, s24 -; GFX10-NEXT: s_mul_i32 s26, s2, s10 -; GFX10-NEXT: s_mul_hi_u32 s27, s2, s10 -; GFX10-NEXT: s_cselect_b32 s25, 1, 0 -; GFX10-NEXT: s_add_u32 s23, s26, s23 -; GFX10-NEXT: s_addc_u32 s24, s27, s24 -; GFX10-NEXT: s_mul_i32 s27, s3, s9 -; GFX10-NEXT: s_mul_hi_u32 s28, s3, s9 -; GFX10-NEXT: 
s_cselect_b32 s26, 1, 0 -; GFX10-NEXT: s_add_u32 s23, s27, s23 -; GFX10-NEXT: s_addc_u32 s24, s28, s24 -; GFX10-NEXT: s_mul_i32 s28, s4, s8 -; GFX10-NEXT: s_mul_hi_u32 s29, s4, s8 -; GFX10-NEXT: s_cselect_b32 s27, 1, 0 -; GFX10-NEXT: s_add_u32 s23, s28, s23 -; GFX10-NEXT: s_addc_u32 s24, s29, s24 -; GFX10-NEXT: s_mul_i32 s29, s0, s11 -; GFX10-NEXT: s_mul_hi_u32 s30, s0, s11 -; GFX10-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10-NEXT: s_add_u32 s18, s29, s18 -; GFX10-NEXT: s_addc_u32 s23, s30, s23 -; GFX10-NEXT: s_mul_i32 s30, s1, s10 -; GFX10-NEXT: s_mul_hi_u32 s31, s1, s10 -; GFX10-NEXT: s_cselect_b32 s29, 1, 0 -; GFX10-NEXT: s_add_u32 s18, s30, s18 -; GFX10-NEXT: s_addc_u32 s23, s31, s23 -; GFX10-NEXT: s_mul_i32 s31, s2, s9 -; GFX10-NEXT: s_mul_hi_u32 s33, s2, s9 -; GFX10-NEXT: s_cselect_b32 s30, 1, 0 -; GFX10-NEXT: s_add_u32 s18, s31, s18 -; GFX10-NEXT: s_addc_u32 s23, s33, s23 -; GFX10-NEXT: s_mul_i32 s33, s3, s8 -; GFX10-NEXT: s_mul_hi_u32 s34, s3, s8 -; GFX10-NEXT: s_cselect_b32 s31, 1, 0 -; GFX10-NEXT: s_add_u32 s18, s33, s18 -; GFX10-NEXT: s_addc_u32 s23, s34, s23 -; GFX10-NEXT: s_cselect_b32 s33, 1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s22, 0 -; GFX10-NEXT: s_mul_hi_u32 s22, s0, s14 -; GFX10-NEXT: s_addc_u32 s18, s21, s18 -; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_mul_hi_u32 s34, s1, s13 -; GFX10-NEXT: s_addc_u32 s19, s19, 0 -; GFX10-NEXT: s_cmp_lg_u32 s21, 0 -; GFX10-NEXT: s_mul_i32 s21, s0, s14 -; GFX10-NEXT: s_addc_u32 s19, s19, s23 -; GFX10-NEXT: s_mul_i32 s23, s1, s13 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_add_u32 s21, s23, s21 -; GFX10-NEXT: s_mul_i32 s23, s2, s12 -; GFX10-NEXT: s_addc_u32 s22, s34, s22 -; GFX10-NEXT: s_mul_hi_u32 s34, s2, s12 -; GFX10-NEXT: s_add_u32 s21, s23, s21 -; GFX10-NEXT: s_mul_i32 s23, s3, s11 -; GFX10-NEXT: s_addc_u32 s22, s34, s22 -; GFX10-NEXT: s_mul_hi_u32 s34, s3, s11 -; GFX10-NEXT: s_add_u32 s21, s23, s21 -; GFX10-NEXT: s_mul_i32 s23, s4, s10 -; GFX10-NEXT: s_addc_u32 
s22, s34, s22 -; GFX10-NEXT: s_mul_hi_u32 s34, s4, s10 -; GFX10-NEXT: s_add_u32 s21, s23, s21 -; GFX10-NEXT: s_mul_i32 s23, s5, s9 -; GFX10-NEXT: s_addc_u32 s22, s34, s22 -; GFX10-NEXT: s_mul_hi_u32 s34, s5, s9 -; GFX10-NEXT: s_add_u32 s21, s23, s21 -; GFX10-NEXT: s_mul_i32 s23, s6, s8 -; GFX10-NEXT: s_addc_u32 s22, s34, s22 -; GFX10-NEXT: s_mul_hi_u32 s34, s6, s8 -; GFX10-NEXT: s_add_u32 s21, s23, s21 -; GFX10-NEXT: s_mul_i32 s23, s0, s13 -; GFX10-NEXT: s_addc_u32 s22, s34, s22 -; GFX10-NEXT: s_mul_hi_u32 s34, s0, s13 -; GFX10-NEXT: s_add_u32 s23, s23, s24 -; GFX10-NEXT: s_addc_u32 s21, s34, s21 -; GFX10-NEXT: s_mul_i32 s34, s1, s12 -; GFX10-NEXT: s_mul_hi_u32 s35, s1, s12 -; GFX10-NEXT: s_cselect_b32 s24, 1, 0 -; GFX10-NEXT: s_add_u32 s23, s34, s23 -; GFX10-NEXT: s_addc_u32 s21, s35, s21 -; GFX10-NEXT: s_mul_i32 s35, s2, s11 -; GFX10-NEXT: s_mul_hi_u32 s36, s2, s11 -; GFX10-NEXT: s_cselect_b32 s34, 1, 0 -; GFX10-NEXT: s_add_u32 s23, s35, s23 -; GFX10-NEXT: s_addc_u32 s21, s36, s21 -; GFX10-NEXT: s_mul_i32 s36, s3, s10 -; GFX10-NEXT: s_mul_hi_u32 s37, s3, s10 -; GFX10-NEXT: s_cselect_b32 s35, 1, 0 -; GFX10-NEXT: s_add_u32 s23, s36, s23 -; GFX10-NEXT: s_addc_u32 s21, s37, s21 -; GFX10-NEXT: s_mul_i32 s37, s4, s9 -; GFX10-NEXT: s_mul_hi_u32 s38, s4, s9 -; GFX10-NEXT: s_cselect_b32 s36, 1, 0 -; GFX10-NEXT: s_add_u32 s23, s37, s23 -; GFX10-NEXT: s_addc_u32 s21, s38, s21 -; GFX10-NEXT: s_mul_i32 s38, s5, s8 -; GFX10-NEXT: s_mul_hi_u32 s39, s5, s8 -; GFX10-NEXT: s_cselect_b32 s37, 1, 0 -; GFX10-NEXT: s_add_u32 s23, s38, s23 -; GFX10-NEXT: s_addc_u32 s21, s39, s21 -; GFX10-NEXT: s_cselect_b32 s38, 1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s30, 0 -; GFX10-NEXT: s_mul_i32 s1, s1, s14 -; GFX10-NEXT: s_addc_u32 s29, s29, 0 -; GFX10-NEXT: s_cmp_lg_u32 s31, 0 -; GFX10-NEXT: s_mul_i32 s2, s2, s13 -; GFX10-NEXT: s_addc_u32 s29, s29, 0 -; GFX10-NEXT: s_cmp_lg_u32 s33, 0 -; GFX10-NEXT: s_mul_i32 s3, s3, s12 -; GFX10-NEXT: s_addc_u32 s29, s29, 0 -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; 
GFX10-NEXT: s_mul_i32 s4, s4, s11 -; GFX10-NEXT: s_addc_u32 s20, s29, s23 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: s_cmp_lg_u32 s26, 0 -; GFX10-NEXT: s_mul_i32 s26, s0, s15 -; GFX10-NEXT: s_addc_u32 s25, s25, 0 -; GFX10-NEXT: s_cmp_lg_u32 s27, 0 -; GFX10-NEXT: s_mul_i32 s5, s5, s10 -; GFX10-NEXT: s_addc_u32 s25, s25, 0 -; GFX10-NEXT: s_cmp_lg_u32 s28, 0 -; GFX10-NEXT: s_mul_i32 s6, s6, s9 -; GFX10-NEXT: s_addc_u32 s25, s25, 0 -; GFX10-NEXT: s_cmp_lg_u32 s23, 0 -; GFX10-NEXT: s_mul_i32 s7, s7, s8 -; GFX10-NEXT: s_addc_u32 s15, s25, s21 -; GFX10-NEXT: s_addc_u32 s21, s22, s26 -; GFX10-NEXT: s_cmp_lg_u32 s38, 0 -; GFX10-NEXT: s_mul_i32 s0, s0, s8 -; GFX10-NEXT: s_addc_u32 s1, s21, s1 -; GFX10-NEXT: s_cmp_lg_u32 s37, 0 -; GFX10-NEXT: s_addc_u32 s1, s1, s2 -; GFX10-NEXT: s_cmp_lg_u32 s36, 0 -; GFX10-NEXT: s_mov_b32 s2, s17 -; GFX10-NEXT: s_addc_u32 s1, s1, s3 -; GFX10-NEXT: s_cmp_lg_u32 s35, 0 -; GFX10-NEXT: s_mov_b32 s3, s18 -; GFX10-NEXT: s_addc_u32 s1, s1, s4 -; GFX10-NEXT: s_cmp_lg_u32 s34, 0 -; GFX10-NEXT: s_mov_b32 s4, s19 -; GFX10-NEXT: s_addc_u32 s1, s1, s5 -; GFX10-NEXT: s_cmp_lg_u32 s24, 0 -; GFX10-NEXT: s_mov_b32 s5, s20 -; GFX10-NEXT: s_addc_u32 s1, s1, s6 -; GFX10-NEXT: s_mov_b32 s6, s15 -; GFX10-NEXT: s_add_i32 s7, s1, s7 -; GFX10-NEXT: s_mov_b32 s1, s16 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_mul_i256: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mul_i32 s17, s0, s10 +; GFX10PLUS-NEXT: s_mul_i32 s19, s1, s9 +; GFX10PLUS-NEXT: s_mul_hi_u32 s18, s0, s10 +; GFX10PLUS-NEXT: s_mul_hi_u32 s20, s1, s9 +; GFX10PLUS-NEXT: s_add_u32 s17, s19, s17 +; GFX10PLUS-NEXT: s_addc_u32 s18, s20, s18 +; GFX10PLUS-NEXT: s_mul_i32 s20, s2, s8 +; GFX10PLUS-NEXT: s_mul_hi_u32 s21, s2, s8 +; GFX10PLUS-NEXT: s_cselect_b32 s19, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s17, s20, s17 +; GFX10PLUS-NEXT: s_mul_hi_u32 s16, s0, s8 +; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18 +; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s9 +; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, 
s9 +; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s16, s21, s16 +; GFX10PLUS-NEXT: s_addc_u32 s17, s22, s17 +; GFX10PLUS-NEXT: s_mul_i32 s22, s1, s8 +; GFX10PLUS-NEXT: s_mul_hi_u32 s23, s1, s8 +; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s16, s22, s16 +; GFX10PLUS-NEXT: s_addc_u32 s17, s23, s17 +; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s12 +; GFX10PLUS-NEXT: s_mul_i32 s25, s1, s11 +; GFX10PLUS-NEXT: s_mul_hi_u32 s24, s0, s12 +; GFX10PLUS-NEXT: s_mul_hi_u32 s26, s1, s11 +; GFX10PLUS-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s23, s25, s23 +; GFX10PLUS-NEXT: s_addc_u32 s24, s26, s24 +; GFX10PLUS-NEXT: s_mul_i32 s26, s2, s10 +; GFX10PLUS-NEXT: s_mul_hi_u32 s27, s2, s10 +; GFX10PLUS-NEXT: s_cselect_b32 s25, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s23, s26, s23 +; GFX10PLUS-NEXT: s_addc_u32 s24, s27, s24 +; GFX10PLUS-NEXT: s_mul_i32 s27, s3, s9 +; GFX10PLUS-NEXT: s_mul_hi_u32 s28, s3, s9 +; GFX10PLUS-NEXT: s_cselect_b32 s26, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s23, s27, s23 +; GFX10PLUS-NEXT: s_addc_u32 s24, s28, s24 +; GFX10PLUS-NEXT: s_mul_i32 s28, s4, s8 +; GFX10PLUS-NEXT: s_mul_hi_u32 s29, s4, s8 +; GFX10PLUS-NEXT: s_cselect_b32 s27, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s23, s28, s23 +; GFX10PLUS-NEXT: s_addc_u32 s24, s29, s24 +; GFX10PLUS-NEXT: s_mul_i32 s29, s0, s11 +; GFX10PLUS-NEXT: s_mul_hi_u32 s30, s0, s11 +; GFX10PLUS-NEXT: s_cselect_b32 s28, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s18, s29, s18 +; GFX10PLUS-NEXT: s_addc_u32 s23, s30, s23 +; GFX10PLUS-NEXT: s_mul_i32 s30, s1, s10 +; GFX10PLUS-NEXT: s_mul_hi_u32 s31, s1, s10 +; GFX10PLUS-NEXT: s_cselect_b32 s29, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s18, s30, s18 +; GFX10PLUS-NEXT: s_addc_u32 s23, s31, s23 +; GFX10PLUS-NEXT: s_mul_i32 s31, s2, s9 +; GFX10PLUS-NEXT: s_mul_hi_u32 s33, s2, s9 +; GFX10PLUS-NEXT: s_cselect_b32 s30, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s18, s31, s18 +; GFX10PLUS-NEXT: s_addc_u32 s23, s33, s23 +; GFX10PLUS-NEXT: s_mul_i32 s33, s3, s8 +; 
GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s8 +; GFX10PLUS-NEXT: s_cselect_b32 s31, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s18, s33, s18 +; GFX10PLUS-NEXT: s_addc_u32 s23, s34, s23 +; GFX10PLUS-NEXT: s_cselect_b32 s33, 1, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s22, 0 +; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s14 +; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18 +; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX10PLUS-NEXT: s_addc_u32 s19, s19, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s14 +; GFX10PLUS-NEXT: s_addc_u32 s19, s19, s23 +; GFX10PLUS-NEXT: s_mul_i32 s23, s1, s13 +; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 +; GFX10PLUS-NEXT: s_mul_i32 s23, s2, s12 +; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 +; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s2, s12 +; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 +; GFX10PLUS-NEXT: s_mul_i32 s23, s3, s11 +; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 +; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s11 +; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 +; GFX10PLUS-NEXT: s_mul_i32 s23, s4, s10 +; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 +; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s4, s10 +; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 +; GFX10PLUS-NEXT: s_mul_i32 s23, s5, s9 +; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 +; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s5, s9 +; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 +; GFX10PLUS-NEXT: s_mul_i32 s23, s6, s8 +; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 +; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s6, s8 +; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 +; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s13 +; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 +; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s0, s13 +; GFX10PLUS-NEXT: s_add_u32 s23, s23, s24 +; GFX10PLUS-NEXT: s_addc_u32 s21, s34, s21 +; GFX10PLUS-NEXT: s_mul_i32 s34, s1, s12 +; GFX10PLUS-NEXT: s_mul_hi_u32 s35, s1, s12 +; GFX10PLUS-NEXT: s_cselect_b32 s24, 1, 0 +; GFX10PLUS-NEXT: 
s_add_u32 s23, s34, s23 +; GFX10PLUS-NEXT: s_addc_u32 s21, s35, s21 +; GFX10PLUS-NEXT: s_mul_i32 s35, s2, s11 +; GFX10PLUS-NEXT: s_mul_hi_u32 s36, s2, s11 +; GFX10PLUS-NEXT: s_cselect_b32 s34, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s23, s35, s23 +; GFX10PLUS-NEXT: s_addc_u32 s21, s36, s21 +; GFX10PLUS-NEXT: s_mul_i32 s36, s3, s10 +; GFX10PLUS-NEXT: s_mul_hi_u32 s37, s3, s10 +; GFX10PLUS-NEXT: s_cselect_b32 s35, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s23, s36, s23 +; GFX10PLUS-NEXT: s_addc_u32 s21, s37, s21 +; GFX10PLUS-NEXT: s_mul_i32 s37, s4, s9 +; GFX10PLUS-NEXT: s_mul_hi_u32 s38, s4, s9 +; GFX10PLUS-NEXT: s_cselect_b32 s36, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s23, s37, s23 +; GFX10PLUS-NEXT: s_addc_u32 s21, s38, s21 +; GFX10PLUS-NEXT: s_mul_i32 s38, s5, s8 +; GFX10PLUS-NEXT: s_mul_hi_u32 s39, s5, s8 +; GFX10PLUS-NEXT: s_cselect_b32 s37, 1, 0 +; GFX10PLUS-NEXT: s_add_u32 s23, s38, s23 +; GFX10PLUS-NEXT: s_addc_u32 s21, s39, s21 +; GFX10PLUS-NEXT: s_cselect_b32 s38, 1, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s30, 0 +; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s14 +; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s31, 0 +; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s13 +; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s33, 0 +; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s12 +; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10PLUS-NEXT: s_mul_i32 s4, s4, s11 +; GFX10PLUS-NEXT: s_addc_u32 s20, s29, s23 +; GFX10PLUS-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s26, 0 +; GFX10PLUS-NEXT: s_mul_i32 s26, s0, s15 +; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s27, 0 +; GFX10PLUS-NEXT: s_mul_i32 s5, s5, s10 +; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10PLUS-NEXT: s_mul_i32 s6, s6, s9 +; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s23, 0 +; GFX10PLUS-NEXT: s_mul_i32 s7, s7, s8 +; GFX10PLUS-NEXT: s_addc_u32 s15, s25, s21 +; 
GFX10PLUS-NEXT: s_addc_u32 s21, s22, s26 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s38, 0 +; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s8 +; GFX10PLUS-NEXT: s_addc_u32 s1, s21, s1 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s37, 0 +; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s2 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s36, 0 +; GFX10PLUS-NEXT: s_mov_b32 s2, s17 +; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s35, 0 +; GFX10PLUS-NEXT: s_mov_b32 s3, s18 +; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s4 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s34, 0 +; GFX10PLUS-NEXT: s_mov_b32 s4, s19 +; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s24, 0 +; GFX10PLUS-NEXT: s_mov_b32 s5, s20 +; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s6 +; GFX10PLUS-NEXT: s_mov_b32 s6, s15 +; GFX10PLUS-NEXT: s_add_i32 s7, s1, s7 +; GFX10PLUS-NEXT: s_mov_b32 s1, s16 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = mul i256 %num, %den %cast = bitcast i256 %result to <8 x i32> ret <8 x i32> %cast @@ -1875,6 +1923,77 @@ ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s4 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v8, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_mul_i256: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 +; GFX11-NEXT: v_mul_lo_u32 v7, v7, v8 +; GFX11-NEXT: v_mul_lo_u32 v27, v6, v9 +; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 +; GFX11-NEXT: v_mad_u64_u32 v[18:19], null, v16, v12, 0 +; GFX11-NEXT: v_mul_lo_u32 v30, v17, v14 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[18:19], s0, v17, v11, v[18:19] +; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] +; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 
v[20:21], null, v16, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] +; GFX11-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX11-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v8, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21] +; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 +; GFX11-NEXT: v_mov_b32_e32 v20, v22 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[19:20] +; GFX11-NEXT: v_mov_b32_e32 v20, v18 +; GFX11-NEXT: v_mov_b32_e32 v19, v22 +; GFX11-NEXT: v_mul_lo_u32 v22, v16, v15 +; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[19:20] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v8, 0 +; GFX11-NEXT: v_mul_lo_u32 v20, v4, v11 +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 +; GFX11-NEXT: v_mad_u64_u32 v[18:19], s1, v2, v11, v[24:25] +; GFX11-NEXT: v_mul_lo_u32 v25, v3, v12 +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15] +; GFX11-NEXT: v_mov_b32_e32 v14, v21 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 +; GFX11-NEXT: v_mad_u64_u32 v[18:19], s3, v3, v10, v[18:19] +; GFX11-NEXT: v_mul_lo_u32 v24, v2, v13 +; GFX11-NEXT: v_mov_b32_e32 v13, v1 +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[11:12] +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 +; GFX11-NEXT: v_mad_u64_u32 v[10:11], s2, v4, v9, v[18:19] +; GFX11-NEXT: v_mad_u64_u32 v[12:13], s4, v16, v9, v[13:14] +; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v8, v[1:2] +; GFX11-NEXT: v_add_co_ci_u32_e64 v14, 
s4, 0, v6, s4 +; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v8, v[10:11] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v8, v[12:13] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v24, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v25, s3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1 +; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v8, vcc_lo, v9, v27, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v7, v8, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = mul i256 %num, %den ret i256 %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.v2i16.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s define <2 x i16> @v_mul_v2i16(<2 x i16> %a, <2 x i16> %b) { ; GFX9-LABEL: v_mul_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel 
-amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s ; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -amdgpu-codegenprepare-widen-16-bit-ops=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps i32 @s_orn2_i32(i32 inreg %src0, i32 inreg %src1) { ; GCN-LABEL: s_orn2_i32: @@ -13,6 +14,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_orn2_b32 s0, s2, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b32 s0, s2, s3 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %or = or i32 %src0, %not.src1 ret i32 %or @@ -28,6 +34,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_orn2_b32 s0, s2, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_i32_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b32 s0, s2, s3 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %or = or i32 %not.src1, %src0 ret i32 %or @@ -45,6 +56,12 @@ ; GFX10-NEXT: s_orn2_b32 s0, s2, s3 ; GFX10-NEXT: s_not_b32 s1, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_i32_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b32 s0, s2, s3 +; GFX11-NEXT: s_not_b32 s1, s3 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %or = or i32 %src0, %not.src1 %insert.0 = insertvalue { i32, i32 } undef, i32 %or, 0 @@ -64,6 +81,12 @@ ; GFX10-NEXT: 
s_orn2_b32 s0, s2, s4 ; GFX10-NEXT: s_orn2_b32 s1, s3, s4 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_i32_multi_foldable_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b32 s0, s2, s4 +; GFX11-NEXT: s_or_not1_b32 s1, s3, s4 +; GFX11-NEXT: ; return to shader part epilog %not.src2 = xor i32 %src2, -1 %or0 = or i32 %src0, %not.src2 %or1 = or i32 %src1, %not.src2 @@ -80,13 +103,13 @@ ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_orn2_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_orn2_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i32 %src1, -1 %or = or i32 %src0, %not.src1 ret i32 %or @@ -99,11 +122,11 @@ ; GCN-NEXT: v_or_b32_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: v_orn2_i32_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX10-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: v_orn2_i32_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %or = or i32 %src0, %not.src1 %cast = bitcast i32 %or to float @@ -117,11 +140,11 @@ ; GCN-NEXT: v_or_b32_e32 v0, s0, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: v_orn2_i32_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_not_b32 s0, s2 -; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: v_orn2_i32_vs: +; GFX10PLUS: ; %bb.0: +; 
GFX10PLUS-NEXT: s_not_b32 s0, s2 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i32 %src1, -1 %or = or i32 %src0, %not.src1 %cast = bitcast i32 %or to float @@ -138,6 +161,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_orn2_b64 s[0:1], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b64 s[0:1], s[2:3], s[4:5] +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %or = or i64 %src0, %not.src1 ret i64 %or @@ -153,6 +181,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_orn2_b64 s[0:1], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_i64_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b64 s[0:1], s[2:3], s[4:5] +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %or = or i64 %not.src1, %src0 ret i64 %or @@ -170,6 +203,12 @@ ; GFX10-NEXT: s_orn2_b64 s[0:1], s[2:3], s[6:7] ; GFX10-NEXT: s_orn2_b64 s[2:3], s[4:5], s[6:7] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_i64_multi_foldable_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b64 s[0:1], s[2:3], s[6:7] +; GFX11-NEXT: s_or_not1_b64 s[2:3], s[4:5], s[6:7] +; GFX11-NEXT: ; return to shader part epilog %not.src2 = xor i64 %src2, -1 %or0 = or i64 %src0, %not.src2 %or1 = or i64 %src1, %not.src2 @@ -192,6 +231,12 @@ ; GFX10-NEXT: s_orn2_b64 s[0:1], s[2:3], s[4:5] ; GFX10-NEXT: s_not_b64 s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_i64_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b64 s[0:1], s[2:3], s[4:5] +; GFX11-NEXT: s_not_b64 s[2:3], s[4:5] +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %or = or i64 %src0, %not.src1 %insert.0 = insertvalue { i64, i64 } undef, i64 %or, 0 @@ -209,15 +254,15 @@ ; GCN-NEXT: v_or_b32_e32 v1, v1, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_orn2_i64: -; 
GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_orn2_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10PLUS-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10PLUS-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i64 %src1, -1 %or = or i64 %src0, %not.src1 ret i64 %or @@ -232,13 +277,13 @@ ; GCN-NEXT: v_or_b32_e32 v1, s3, v1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: v_orn2_i64_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX10-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX10-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: v_orn2_i64_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX10PLUS-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %or = or i64 %src0, %not.src1 %cast = bitcast i64 %or to <2 x float> @@ -253,12 +298,12 @@ ; GCN-NEXT: v_or_b32_e32 v1, s1, v1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: v_orn2_i64_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_not_b64 s[0:1], s[2:3] -; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: v_orn2_i64_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_not_b64 s[0:1], s[2:3] +; GFX10PLUS-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10PLUS-NEXT: v_or_b32_e32 v1, s1, v1 +; 
GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i64 %src1, -1 %or = or i64 %src0, %not.src1 %cast = bitcast i64 %or to <2 x float> @@ -275,6 +320,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_orn2_b64 s[0:1], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b64 s[0:1], s[2:3], s[4:5] +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i32> %src1, %or = or <2 x i32> %src0, %not.src1 ret <2 x i32> %or @@ -290,6 +340,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_orn2_b64 s[0:1], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_v2i32_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b64 s[0:1], s[2:3], s[4:5] +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i32> %src1, %or = or <2 x i32> %not.src1, %src0 ret <2 x i32> %or @@ -305,6 +360,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_orn2_b32 s0, s2, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b32 s0, s2, s3 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 ret i16 %or @@ -320,6 +380,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_orn2_b32 s0, s2, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_i16_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b32 s0, s2, s3 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %not.src1, %src0 ret i16 %or @@ -337,6 +402,12 @@ ; GFX10-NEXT: s_orn2_b32 s0, s2, s3 ; GFX10-NEXT: s_xor_b32 s1, s3, -1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_i16_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b32 s0, s2, s3 +; GFX11-NEXT: s_xor_b32 s1, s3, -1 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 %insert.0 = insertvalue { i16, i16 } undef, i16 %or, 0 @@ -356,6 
+427,12 @@ ; GFX10-NEXT: s_orn2_b32 s0, s2, s4 ; GFX10-NEXT: s_orn2_b32 s1, s3, s4 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_i16_multi_foldable_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b32 s0, s2, s4 +; GFX11-NEXT: s_or_not1_b32 s1, s3, s4 +; GFX11-NEXT: ; return to shader part epilog %not.src2 = xor i16 %src2, -1 %or0 = or i16 %src0, %not.src2 %or1 = or i16 %src1, %not.src2 @@ -372,13 +449,13 @@ ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_orn2_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_orn2_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 ret i16 %or @@ -392,12 +469,12 @@ ; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: v_orn2_i16_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX10-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: v_orn2_i16_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 %zext = zext i16 %or to i32 @@ -413,12 +490,12 @@ ; GCN-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: v_orn2_i16_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_xor_b32 s0, s2, -1 -; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 -; 
GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: v_orn2_i16_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_xor_b32 s0, s2, -1 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor i16 %src1, -1 %or = or i16 %src0, %not.src1 %zext = zext i16 %or to i32 @@ -448,6 +525,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_orn2_b32 s0, s2, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b32 s0, s2, s3 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %or = or <2 x i16> %src0, %not.src1 %cast = bitcast <2 x i16> %or to i32 @@ -476,6 +558,11 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_orn2_b32 s0, s2, s3 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_v2i16_commute: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b32 s0, s2, s3 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %or = or <2 x i16> %not.src1, %src0 %cast = bitcast <2 x i16> %or to i32 @@ -506,6 +593,12 @@ ; GFX10-NEXT: s_orn2_b32 s0, s2, s3 ; GFX10-NEXT: s_xor_b32 s1, s3, -1 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_v2i16_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b32 s0, s2, s3 +; GFX11-NEXT: s_xor_b32 s1, s3, -1 +; GFX11-NEXT: ; return to shader part epilog %not.src1 = xor <2 x i16> %src1, %or = or <2 x i16> %src0, %not.src1 @@ -544,6 +637,12 @@ ; GFX10-NEXT: s_orn2_b32 s0, s2, s4 ; GFX10-NEXT: s_orn2_b32 s1, s3, s4 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_orn2_v2i16_multi_foldable_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_or_not1_b32 s0, s2, s4 +; GFX11-NEXT: s_or_not1_b32 s1, s3, s4 +; GFX11-NEXT: ; return to shader part epilog %not.src2 = xor <2 x i16> %src2, %or0 = or <2 x i16> %src0, %not.src2 %or1 = or <2 x i16> %src1, %not.src2 @@ -577,13 +676,13 @@ ; 
GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_orn2_v2i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_orn2_v2i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor <2 x i16> %src1, %or = or <2 x i16> %src0, %not.src1 ret <2 x i16> %or @@ -650,13 +749,13 @@ ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_orn2_v4i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, -1 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] -; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_orn2_v4i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, -1 +; GFX10PLUS-NEXT: s_mov_b32 s1, s0 +; GFX10PLUS-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor <4 x i16> %src1, %or = or <4 x i16> %src0, %not.src1 %cast = bitcast <4 x i16> %or to i64 @@ -692,13 +791,13 @@ ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_orn2_v4i16_commute: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, -1 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1] -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_orn2_v4i16_commute: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, -1 +; GFX10PLUS-NEXT: s_mov_b32 s1, s0 +; GFX10PLUS-NEXT: 
s_xor_b64 s[0:1], s[4:5], s[0:1] +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor <4 x i16> %src1, %or = or <4 x i16> %not.src1, %src0 %cast = bitcast <4 x i16> %or to i64 @@ -736,15 +835,15 @@ ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_orn2_v4i16_multi_use: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, -1 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] -; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: s_mov_b32 s3, s5 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_orn2_v4i16_multi_use: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, -1 +; GFX10PLUS-NEXT: s_mov_b32 s1, s0 +; GFX10PLUS-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1] +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] +; GFX10PLUS-NEXT: s_mov_b32 s2, s4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s5 +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src1 = xor <4 x i16> %src1, %or = or <4 x i16> %src0, %not.src1 @@ -792,14 +891,14 @@ ; GFX9-NEXT: s_or_b64 s[2:3], s[4:5], s[6:7] ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_orn2_v4i16_multi_foldable_use: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s0, -1 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] -; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7] -; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[6:7] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_orn2_v4i16_multi_foldable_use: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s0, -1 +; GFX10PLUS-NEXT: s_mov_b32 s1, s0 +; GFX10PLUS-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] +; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[2:3], s[6:7] +; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[4:5], s[6:7] +; GFX10PLUS-NEXT: ; return to shader part epilog %not.src2 = xor <4 x i16> %src2, %or0 = or <4 x i16> %src0, %not.src2 %or1 = or <4 x i16> %src1, %not.src2 @@ 
-844,15 +943,15 @@ ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_orn2_v4i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_orn2_v4i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10PLUS-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10PLUS-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %not.src1 = xor <4 x i16> %src1, %or = or <4 x i16> %src0, %not.src1 ret <4 x i16> %or diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll @@ -3,7 +3,8 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define float @v_roundeven_f32(float %x) { ; GFX6-LABEL: v_roundeven_f32: @@ -30,12 +31,12 @@ ; GFX9-NEXT: v_rndne_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_roundeven_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rndne_f32_e32 v0, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_roundeven_f32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_rndne_f32_e32 v0, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %roundeven = call float @llvm.roundeven.f32(float %x) ret float %roundeven } @@ -69,13 +70,13 @@ ; GFX9-NEXT: v_rndne_f32_e32 v1, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_roundeven_v2f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rndne_f32_e32 v0, v0 -; GFX10-NEXT: v_rndne_f32_e32 v1, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_roundeven_v2f32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_rndne_f32_e32 v0, v0 +; GFX10PLUS-NEXT: v_rndne_f32_e32 v1, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %roundeven = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %x) ret <2 x float> %roundeven } @@ -113,14 +114,14 @@ ; GFX9-NEXT: v_rndne_f32_e32 v2, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_roundeven_v3f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rndne_f32_e32 v0, v0 -; GFX10-NEXT: v_rndne_f32_e32 v1, v1 -; GFX10-NEXT: v_rndne_f32_e32 v2, v2 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_roundeven_v3f32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_rndne_f32_e32 v0, v0 +; GFX10PLUS-NEXT: v_rndne_f32_e32 v1, v1 +; GFX10PLUS-NEXT: v_rndne_f32_e32 v2, v2 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %roundeven = call <3 x float> @llvm.roundeven.v3f32(<3 x float> %x) ret <3 x float> 
%roundeven } @@ -162,15 +163,15 @@ ; GFX9-NEXT: v_rndne_f32_e32 v3, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_roundeven_v4f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rndne_f32_e32 v0, v0 -; GFX10-NEXT: v_rndne_f32_e32 v1, v1 -; GFX10-NEXT: v_rndne_f32_e32 v2, v2 -; GFX10-NEXT: v_rndne_f32_e32 v3, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_roundeven_v4f32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_rndne_f32_e32 v0, v0 +; GFX10PLUS-NEXT: v_rndne_f32_e32 v1, v1 +; GFX10PLUS-NEXT: v_rndne_f32_e32 v2, v2 +; GFX10PLUS-NEXT: v_rndne_f32_e32 v3, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %roundeven = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x) ret <4 x float> %roundeven } @@ -204,12 +205,12 @@ ; GFX9-NEXT: v_rndne_f16_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_roundeven_f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rndne_f16_e32 v0, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_roundeven_f16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_rndne_f16_e32 v0, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %roundeven = call half @llvm.roundeven.f16(half %x) ret half %roundeven } @@ -264,6 +265,17 @@ ; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_roundeven_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_rndne_f16_e32 v0, v0 +; GFX11-NEXT: 
v_rndne_f16_e32 v1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x) ret <2 x half> %roundeven } @@ -331,6 +343,18 @@ ; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_roundeven_v2f16_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_rndne_f16_e32 v0, v0 +; GFX11-NEXT: v_rndne_f16_e32 v1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %roundeven = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %x.fneg) ret <2 x half> %roundeven @@ -408,6 +432,22 @@ ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 ; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_roundeven_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_rndne_f16_e32 v0, v0 +; GFX11-NEXT: v_rndne_f16_e32 v1, v1 +; GFX11-NEXT: v_rndne_f16_e32 v2, v2 +; GFX11-NEXT: v_rndne_f16_e32 v3, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x) ret <4 x half> %roundeven } @@ -438,12 +478,12 @@ ; GFX9-NEXT: v_rndne_f32_e64 v0, |v0| ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
v_roundeven_f32_fabs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rndne_f32_e64 v0, |v0| -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_roundeven_f32_fabs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_rndne_f32_e64 v0, |v0| +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %roundeven = call float @llvm.roundeven.f32(float %fabs.x) ret float %roundeven @@ -470,10 +510,10 @@ ; GFX9-NEXT: v_rndne_f32_e32 v0, s0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_roundeven_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_rndne_f32_e32 v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_roundeven_f32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_rndne_f32_e32 v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %roundeven = call float @llvm.roundeven.f32(float %x) ret float %roundeven } @@ -503,12 +543,12 @@ ; GFX9-NEXT: v_rndne_f32_e64 v0, -v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_roundeven_f32_fneg: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rndne_f32_e64 v0, -v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_roundeven_f32_fneg: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_rndne_f32_e64 v0, -v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg float %x %roundeven = call float @llvm.roundeven.f32(float %neg.x) ret float %roundeven @@ -548,12 +588,12 @@ ; GFX9-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_roundeven_f64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_roundeven_f64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %roundeven = call double @llvm.roundeven.f64(double %x) ret double %roundeven } @@ -593,12 +633,12 @@ ; GFX9-NEXT: v_rndne_f64_e64 v[0:1], -v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_roundeven_f64_fneg: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rndne_f64_e64 v[0:1], -v[0:1] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_roundeven_f64_fneg: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_rndne_f64_e64 v[0:1], -v[0:1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg double %x %roundeven = call double @llvm.roundeven.f64(double %neg.x) ret double %roundeven @@ -648,13 +688,13 @@ ; GFX9-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_roundeven_v2f64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] -; GFX10-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_roundeven_v2f64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_rndne_f64_e32 v[0:1], v[0:1] +; GFX10PLUS-NEXT: v_rndne_f64_e32 v[2:3], v[2:3] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %roundeven = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x) ret <2 x double> %roundeven } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -2,7 +2,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s ; Test optimization to reduce shifts to narrower sizes. 
@@ -20,6 +21,13 @@ ; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_shl_i64_zext_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s0, s0, -2.0 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-NEXT: ; return to shader part epilog %and = and i32 %x, 1073741823 %ext = zext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -43,6 +51,14 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_shl_i64_zext_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3fffffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %and = and i32 %x, 1073741823 %ext = zext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -57,12 +73,12 @@ ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i64_sext_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s0, 0x1fffffff -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i64_sext_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x1fffffff +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10PLUS-NEXT: ; return to shader part epilog %and = and i32 %x, 536870911 %ext = sext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -86,6 +102,14 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_shl_i64_sext_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x1fffffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] 
%and = and i32 %x, 536870911 %ext = sext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -100,12 +124,12 @@ ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i64_zext_i32_overflow: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_bitset0_b32 s0, 31 -; GFX10-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i64_zext_i32_overflow: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bitset0_b32 s0, 31 +; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX10PLUS-NEXT: ; return to shader part epilog %and = and i32 %x, 2147483647 %ext = zext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -145,6 +169,14 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_shl_i64_zext_i32_overflow: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7fffffff, v0 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] %and = and i32 %x, 2147483647 %ext = zext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -159,12 +191,12 @@ ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i64_sext_i32_overflow: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_bitset0_b32 s0, 31 -; GFX10-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i64_sext_i32_overflow: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bitset0_b32 s0, 31 +; GFX10PLUS-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX10PLUS-NEXT: ; return to shader part epilog %and = and i32 %x, 2147483647 %ext = sext i32 %and to i64 %shl = 
shl i64 %ext, 2 @@ -196,14 +228,14 @@ ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_i64_sext_i32_overflow: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_i64_sext_i32_overflow: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %and = and i32 %x, 2147483647 %ext = sext i32 %and to i64 %shl = shl i64 %ext, 2 @@ -268,6 +300,20 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo ; GFX10-NEXT: global_store_dword v[2:3], v1, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: mulu24_shl64: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 6, v0 +; GFX11-NEXT: v_mul_u32_u24_e32 v0, 7, v0 +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX11-NEXT: global_store_b32 v[2:3], v1, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = and i32 %tmp, 6 @@ -353,6 +399,21 @@ ; GFX10-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] ; GFX10-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: muli24_shl64: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: 
v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_or_b32_e32 v1, 0xff800000, v1 +; GFX11-NEXT: v_mul_i32_i24_e32 v1, -7, v1 +; GFX11-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = sext i32 %tmp to i64 @@ -380,17 +441,17 @@ ; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_v2i64_zext_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_brev_b32 s2, -4 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_mov_b32 s2, s1 -; GFX10-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 -; GFX10-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x200000 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_v2i64_zext_v2i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_brev_b32 s2, -4 +; GFX10PLUS-NEXT: s_mov_b32 s3, s2 +; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX10PLUS-NEXT: s_mov_b32 s2, s1 +; GFX10PLUS-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 +; GFX10PLUS-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x200000 +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX10PLUS-NEXT: ; return to shader part epilog %and = and <2 x i32> %x, %ext = zext <2 x i32> %and to <2 x i64> %shl = shl <2 x i64> %ext, @@ -442,6 +503,16 @@ ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[2:3] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[4:5] ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_shl_v2i64_zext_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0x3fffffff, v0 +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_and_b32 v4, 0x3fffffff, v1 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[2:3] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[4:5] +; GFX11-NEXT: s_setpc_b64 s[30:31] %and = and <2 x i32> %x, %ext = zext <2 x i32> %and to <2 x i64> %shl = shl <2 x i64> %ext, @@ -461,17 +532,17 @@ ; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_v2i64_sext_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_brev_b32 s2, -8 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; GFX10-NEXT: s_mov_b32 s2, s1 -; GFX10-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 -; GFX10-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x200000 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_v2i64_sext_v2i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_brev_b32 s2, -8 +; GFX10PLUS-NEXT: s_mov_b32 s3, s2 +; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX10PLUS-NEXT: s_mov_b32 s2, s1 +; GFX10PLUS-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 +; GFX10PLUS-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x200000 +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GFX10PLUS-NEXT: ; return to shader part epilog %and = and <2 x i32> %x, %ext = sext <2 x i32> %and to <2 x i64> %shl = shl <2 x i64> %ext, @@ -512,17 +583,17 @@ ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_v2i64_sext_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0x1fffffff, v1 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX10-NEXT: 
v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_v2i64_sext_v2i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 +; GFX10PLUS-NEXT: v_and_b32_e32 v2, 0x1fffffff, v1 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %and = and <2 x i32> %x, %ext = sext <2 x i32> %and to <2 x i64> %shl = shl <2 x i64> %ext, @@ -549,11 +620,11 @@ ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i32_zext_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s0, 0x3fff -; GFX10-NEXT: s_lshl_b32 s0, s0, 2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i32_zext_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x3fff +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10PLUS-NEXT: ; return to shader part epilog %and = and i16 %x, 16383 %ext = zext i16 %and to i32 %shl = shl i32 %ext, 2 @@ -583,14 +654,14 @@ ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_i32_zext_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v0, 0x3fff, v0 -; GFX10-NEXT: v_lshlrev_b16 v0, 2, v0 -; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_i32_zext_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0x3fff, v0 +; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 2, v0 +; GFX10PLUS-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %and = and i16 %x, 16383 
%ext = zext i16 %and to i32 %shl = shl i32 %ext, 2 @@ -630,14 +701,14 @@ ; GFX9-NEXT: s_lshl_b32 s1, s1, 2 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_v2i32_zext_v2i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s0, 0x3fff3fff -; GFX10-NEXT: s_and_b32 s1, s0, 0xffff -; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_lshl_b32 s0, s1, 2 -; GFX10-NEXT: s_lshl_b32 s1, s2, 2 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_v2i32_zext_v2i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x3fff3fff +; GFX10PLUS-NEXT: s_and_b32 s1, s0, 0xffff +; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s1, 2 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, 2 +; GFX10PLUS-NEXT: ; return to shader part epilog %and = and <2 x i16> %x, %ext = zext <2 x i16> %and to <2 x i32> %shl = shl <2 x i32> %ext, @@ -690,6 +761,17 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_shl_v2i32_zext_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3fff3fff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %and = and <2 x i16> %x, %ext = zext <2 x i16> %and to <2 x i32> %shl = shl <2 x i32> %ext, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -2,7 +2,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s ; RUN: llc 
-global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_shl_i8(i8 %value, i8 %amount) { ; GFX6-LABEL: v_shl_i8: @@ -24,13 +25,13 @@ ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_i8: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_i8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10PLUS-NEXT: v_lshlrev_b16 v0, v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl i8 %value, %amount ret i8 %result } @@ -54,12 +55,12 @@ ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 7, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_i8_7: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b16 v0, 7, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_i8_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 7, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl i8 %value, 7 ret i8 
%result } @@ -82,11 +83,11 @@ ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i8: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xff +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i8 %value, %amount ret i8 %result } @@ -109,11 +110,11 @@ ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i8_7: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s1, 7, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i8_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_u32 s1, 7, 0x100000 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i8 %value, 7 ret i8 %result } @@ -127,13 +128,13 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_i24: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_i24: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl i24 %value, %amount ret i24 %result } @@ -145,12 +146,12 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_i24_7: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: 
v_lshlrev_b32_e32 v0, 7, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_i24_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl i24 %value, 7 ret i24 %result } @@ -161,10 +162,10 @@ ; GCN-NEXT: s_lshl_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i24: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshl_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i24: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i24 %value, %amount ret i24 %result } @@ -175,10 +176,10 @@ ; GCN-NEXT: s_lshl_b32 s0, s0, 7 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i24_7: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshl_b32 s0, s0, 7 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i24_7: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 7 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i24 %value, 7 ret i24 %result } @@ -190,12 +191,12 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v0, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl i32 %value, %amount ret i32 %result } @@ -207,12 +208,12 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v0, 31, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_i32_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 31, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_i32_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, 31, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl i32 %value, 31 ret i32 %result } @@ -223,10 +224,10 @@ ; GCN-NEXT: s_lshl_b32 s0, s0, s1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshl_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i32 %value, %amount ret i32 %result } @@ -237,10 +238,10 @@ ; GCN-NEXT: s_lshl_b32 s0, s0, 31 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i32_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshl_b32 s0, s0, 31 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i32_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 31 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i32 %value, 31 ret i32 %result } @@ -261,10 +262,10 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: shl_i32_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: shl_i32_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshlrev_b32_e64 v0, v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i32 %value, %amount %cast = bitcast i32 %result to float ret float %cast @@ -276,10 +277,10 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v0, s0, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: shl_i32_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; 
GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: shl_i32_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i32 %value, %amount %cast = bitcast i32 %result to float ret float %cast @@ -293,13 +294,13 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v1, v3, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, v3, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_v2i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, v3, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i32> %value, %amount ret <2 x i32> %result } @@ -312,13 +313,13 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v1, 31, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_v2i32_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 31, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_v2i32_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, 31, v0 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i32> %value, ret <2 x i32> %result } @@ -330,11 +331,11 @@ ; GCN-NEXT: s_lshl_b32 s1, s1, s3 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 -; GFX10-NEXT: ; return to shader part 
epilog +; GFX10PLUS-LABEL: s_shl_v2i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl <2 x i32> %value, %amount ret <2 x i32> %result } @@ -348,14 +349,14 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v2, v5, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_v3i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, v3, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, v5, v2 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_v3i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, v3, v0 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, v4, v1 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v2, v5, v2 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl <3 x i32> %value, %amount ret <3 x i32> %result } @@ -368,12 +369,12 @@ ; GCN-NEXT: s_lshl_b32 s2, s2, s5 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_v3i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 -; GFX10-NEXT: s_lshl_b32 s1, s1, s4 -; GFX10-NEXT: s_lshl_b32 s2, s2, s5 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_v3i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s4 +; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, s5 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl <3 x i32> %value, %amount ret <3 x i32> %result } @@ -388,15 +389,15 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v3, v7, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_v4i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 
v5, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, v7, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_v4i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, v4, v0 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, v5, v1 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v2, v6, v2 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v3, v7, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl <4 x i32> %value, %amount ret <4 x i32> %result } @@ -410,13 +411,13 @@ ; GCN-NEXT: s_lshl_b32 s3, s3, s7 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_v4i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshl_b32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s1, s1, s5 -; GFX10-NEXT: s_lshl_b32 s2, s2, s6 -; GFX10-NEXT: s_lshl_b32 s3, s3, s7 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_v4i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s4 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s5 +; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, s6 +; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, s7 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl <4 x i32> %value, %amount ret <4 x i32> %result } @@ -432,16 +433,16 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v4, v9, v4 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_v5i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, v5, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, v6, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, v8, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, v9, v4 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_v5i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, v5, v0 +; GFX10PLUS-NEXT: 
v_lshlrev_b32_e32 v1, v6, v1 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v2, v7, v2 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v3, v8, v3 +; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v4, v9, v4 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl <5 x i32> %value, %amount ret <5 x i32> %result } @@ -456,14 +457,14 @@ ; GCN-NEXT: s_lshl_b32 s4, s4, s9 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_v5i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshl_b32 s0, s0, s5 -; GFX10-NEXT: s_lshl_b32 s1, s1, s6 -; GFX10-NEXT: s_lshl_b32 s2, s2, s7 -; GFX10-NEXT: s_lshl_b32 s3, s3, s8 -; GFX10-NEXT: s_lshl_b32 s4, s4, s9 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_v5i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s6 +; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, s7 +; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, s8 +; GFX10PLUS-NEXT: s_lshl_b32 s4, s4, s9 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl <5 x i32> %value, %amount ret <5 x i32> %result } @@ -515,6 +516,30 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v15, v31, v15 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_shl_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, v16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, v17, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, v18, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, v19, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, v20, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, v21, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, v22, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, v23, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, v24, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, v25, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, v26, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, v27, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, v28, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, v29, v13 
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, v30, v14 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v15, v31, v15 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = shl <16 x i32> %value, %amount ret <16 x i32> %result } @@ -540,25 +565,25 @@ ; GCN-NEXT: s_lshl_b32 s15, s15, s31 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_v16i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshl_b32 s0, s0, s16 -; GFX10-NEXT: s_lshl_b32 s1, s1, s17 -; GFX10-NEXT: s_lshl_b32 s2, s2, s18 -; GFX10-NEXT: s_lshl_b32 s3, s3, s19 -; GFX10-NEXT: s_lshl_b32 s4, s4, s20 -; GFX10-NEXT: s_lshl_b32 s5, s5, s21 -; GFX10-NEXT: s_lshl_b32 s6, s6, s22 -; GFX10-NEXT: s_lshl_b32 s7, s7, s23 -; GFX10-NEXT: s_lshl_b32 s8, s8, s24 -; GFX10-NEXT: s_lshl_b32 s9, s9, s25 -; GFX10-NEXT: s_lshl_b32 s10, s10, s26 -; GFX10-NEXT: s_lshl_b32 s11, s11, s27 -; GFX10-NEXT: s_lshl_b32 s12, s12, s28 -; GFX10-NEXT: s_lshl_b32 s13, s13, s29 -; GFX10-NEXT: s_lshl_b32 s14, s14, s30 -; GFX10-NEXT: s_lshl_b32 s15, s15, s31 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_v16i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s16 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s17 +; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, s18 +; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, s19 +; GFX10PLUS-NEXT: s_lshl_b32 s4, s4, s20 +; GFX10PLUS-NEXT: s_lshl_b32 s5, s5, s21 +; GFX10PLUS-NEXT: s_lshl_b32 s6, s6, s22 +; GFX10PLUS-NEXT: s_lshl_b32 s7, s7, s23 +; GFX10PLUS-NEXT: s_lshl_b32 s8, s8, s24 +; GFX10PLUS-NEXT: s_lshl_b32 s9, s9, s25 +; GFX10PLUS-NEXT: s_lshl_b32 s10, s10, s26 +; GFX10PLUS-NEXT: s_lshl_b32 s11, s11, s27 +; GFX10PLUS-NEXT: s_lshl_b32 s12, s12, s28 +; GFX10PLUS-NEXT: s_lshl_b32 s13, s13, s29 +; GFX10PLUS-NEXT: s_lshl_b32 s14, s14, s30 +; GFX10PLUS-NEXT: s_lshl_b32 s15, s15, s31 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl <16 x i32> %value, %amount ret <16 x i32> %result } @@ -583,12 +608,12 @@ ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 
s[30:31] ; -; GFX10-LABEL: v_shl_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b16 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b16 v0, v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl i16 %value, %amount ret i16 %result } @@ -599,11 +624,11 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_i16_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_i16_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl i16 %value, 31 ret i16 %result } @@ -626,11 +651,11 @@ ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: s_lshl_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i16 %value, %amount ret i16 %result } @@ -653,11 +678,11 @@ ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i16_15: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s1, 15, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i16_15: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_bfe_u32 s1, 15, 0x100000 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; 
GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i16 %value, 15 ret i16 %result } @@ -679,10 +704,10 @@ ; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: shl_i16_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: shl_i16_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshlrev_b16 v0, v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i16 %value, %amount %cast = bitcast i16 %result to half ret half %cast @@ -705,10 +730,10 @@ ; GFX9-NEXT: v_lshlrev_b16_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: shl_i16_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshlrev_b16 v0, s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: shl_i16_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshlrev_b16 v0, s0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i16 %value, %amount %cast = bitcast i16 %result to half ret half %cast @@ -738,12 +763,12 @@ ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_v2i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, v1, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_v2i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v0, v1, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i16> %value, %amount ret <2 x i16> %result } @@ -771,12 +796,12 @@ ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_v2i16_15: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, 15, v0 
op_sel_hi:[0,1] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_v2i16_15: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i16> %value, ret <2 x i16> %result } @@ -813,14 +838,14 @@ ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_v2i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_v2i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s2, s3 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to i32 ret i32 %cast @@ -853,10 +878,10 @@ ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: shl_v2i16_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: shl_v2i16_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v0, v0, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to float ret float %cast @@ -889,10 +914,10 @@ ; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: shl_v2i16_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_pk_lshlrev_b16 v0, s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: shl_v2i16_vs: +; GFX10PLUS: ; %bb.0: +; 
GFX10PLUS-NEXT: v_pk_lshlrev_b16 v0, s0, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl <2 x i16> %value, %amount %cast = bitcast <2 x i16> %result to float ret float %cast @@ -949,13 +974,13 @@ ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v3, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_v4i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_v4i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v0, v2, v0 +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v1, v3, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x float> ret <2 x float> %cast @@ -1012,19 +1037,19 @@ ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_v4i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s2, s4, s5 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 -; GFX10-NEXT: s_lshl_b32 s3, s4, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_v4i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s2, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10PLUS-NEXT: s_lshl_b32 s2, s4, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10PLUS-NEXT: s_lshl_b32 s3, s4, s5 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; 
GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl <4 x i16> %value, %amount %cast = bitcast <4 x i16> %result to <2 x i32> ret <2 x i32> %cast @@ -1117,15 +1142,15 @@ ; GFX9-NEXT: v_pk_lshlrev_b16 v3, v7, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_v8i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, v4, v0 -; GFX10-NEXT: v_pk_lshlrev_b16 v1, v5, v1 -; GFX10-NEXT: v_pk_lshlrev_b16 v2, v6, v2 -; GFX10-NEXT: v_pk_lshlrev_b16 v3, v7, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_v8i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v0, v4, v0 +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v1, v5, v1 +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v2, v6, v2 +; GFX10PLUS-NEXT: v_pk_lshlrev_b16 v3, v7, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x float> ret <4 x float> %cast @@ -1220,29 +1245,29 @@ ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_v8i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s8, s0, 16 -; GFX10-NEXT: s_lshr_b32 s9, s4, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s4, s8, s9 -; GFX10-NEXT: s_lshr_b32 s8, s1, 16 -; GFX10-NEXT: s_lshr_b32 s9, s5, 16 -; GFX10-NEXT: s_lshl_b32 s1, s1, s5 -; GFX10-NEXT: s_lshl_b32 s5, s8, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s6, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, s6 -; GFX10-NEXT: s_lshl_b32 s4, s4, s5 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_lshr_b32 s6, s7, 16 -; GFX10-NEXT: s_lshl_b32 s3, s3, s7 -; GFX10-NEXT: s_lshl_b32 s5, s5, s6 -; 
GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_v8i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s9, s4, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, s4 +; GFX10PLUS-NEXT: s_lshl_b32 s4, s8, s9 +; GFX10PLUS-NEXT: s_lshr_b32 s8, s1, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s9, s5, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, s5 +; GFX10PLUS-NEXT: s_lshl_b32 s5, s8, s9 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s6, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s2, s2, s6 +; GFX10PLUS-NEXT: s_lshl_b32 s4, s4, s5 +; GFX10PLUS-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s6, s7, 16 +; GFX10PLUS-NEXT: s_lshl_b32 s3, s3, s7 +; GFX10PLUS-NEXT: s_lshl_b32 s5, s5, s6 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl <8 x i16> %value, %amount %cast = bitcast <8 x i16> %result to <4 x i32> ret <4 x i32> %cast @@ -1267,12 +1292,12 @@ ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl i64 %value, %amount ret i64 %result } @@ -1292,6 +1317,13 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 31, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_shl_i64_63: +; 
GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v1, 31, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = shl i64 %value, 63 ret i64 %result } @@ -1311,6 +1343,13 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_shl_i64_33: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v1, 1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = shl i64 %value, 33 ret i64 %result } @@ -1330,6 +1369,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_shl_i64_32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = shl i64 %value, 32 ret i64 %result } @@ -1353,12 +1399,12 @@ ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_i64_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_i64_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl i64 %value, 31 ret i64 %result } @@ -1369,10 +1415,10 @@ ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; GFX10-NEXT: ; return to shader part epilog 
+; GFX10PLUS-LABEL: s_shl_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i64 %value, %amount ret i64 %result } @@ -1384,11 +1430,11 @@ ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i64_63: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshl_b32 s1, s0, 31 -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i64_63: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b32 s1, s0, 31 +; GFX10PLUS-NEXT: s_mov_b32 s0, 0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i64 %value, 63 ret i64 %result } @@ -1400,11 +1446,11 @@ ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i64_33: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshl_b32 s1, s0, 1 -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i64_33: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b32 s1, s0, 1 +; GFX10PLUS-NEXT: s_mov_b32 s0, 0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i64 %value, 33 ret i64 %result } @@ -1416,11 +1462,11 @@ ; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i64_32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i64_32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s1, s0 +; GFX10PLUS-NEXT: s_mov_b32 s0, 0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i64 %value, 32 ret i64 %result } @@ -1431,10 +1477,10 @@ ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 31 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i64_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 31 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i64_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 31 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i64 %value, 31 ret i64 %result } @@ -1455,10 +1501,10 @@ ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: shl_i64_sv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: shl_i64_sv: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i64 %value, %amount %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -1480,10 +1526,10 @@ ; GFX9-NEXT: v_lshlrev_b64 v[0:1], s0, v[0:1] ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: shl_i64_vs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s0, v[0:1] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: shl_i64_vs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], s0, v[0:1] +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i64 %value, %amount %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -1511,13 +1557,13 @@ ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_shl_v2i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_v2i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i64> %value, %amount ret <2 x i64> %result } @@ -1544,13 +1590,13 @@ ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 31, v[2:3] ; GFX9-NEXT: s_setpc_b64 
s[30:31] ; -; GFX10-LABEL: v_shl_v2i64_31: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[2:3], 31, v[2:3] -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_shl_v2i64_31: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] +; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 31, v[2:3] +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i64> %value, ret <2 x i64> %result } @@ -1562,11 +1608,11 @@ ; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_v2i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_v2i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl <2 x i64> %value, %amount ret <2 x i64> %result } @@ -1644,6 +1690,24 @@ ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_shl_i65: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_sub_nc_u32_e32 v6, 64, v3 +; GFX11-NEXT: v_lshlrev_b64 v[4:5], v3, v[2:3] +; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v3 +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 +; GFX11-NEXT: v_lshrrev_b64 v[5:6], v6, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, v[0:1] +; GFX11-NEXT: v_or_b32_e32 v1, v5, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v1 :: v_dual_cndmask_b32 
v1, 0, v7 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = shl i65 %value, %amount ret i65 %result } @@ -1685,6 +1749,15 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_shl_i65_33: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], 31, v[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = shl i65 %value, 33 ret i65 %result } @@ -1710,25 +1783,25 @@ ; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i65: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_sub_i32 s10, s3, 64 -; GFX10-NEXT: s_sub_i32 s4, 64, s3 -; GFX10-NEXT: s_cmp_lt_u32 s3, 64 -; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s3, 0 -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 -; GFX10-NEXT: s_lshl_b64 s[6:7], s[2:3], s3 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], s3 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], s10 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i65: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64 +; GFX10PLUS-NEXT: s_sub_i32 s4, 64, s3 +; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 +; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 +; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[2:3], s3 +; GFX10PLUS-NEXT: 
s_lshl_b64 s[8:9], s[0:1], s3 +; GFX10PLUS-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX10PLUS-NEXT: s_lshl_b64 s[6:7], s[0:1], s10 +; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[8:9], 0 +; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i65 %value, %amount ret i65 %result } @@ -1745,15 +1818,15 @@ ; GCN-NEXT: s_mov_b32 s1, s4 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_shl_i65_33: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s4, 0 -; GFX10-NEXT: s_lshl_b32 s5, s2, 1 -; GFX10-NEXT: s_lshr_b64 s[2:3], s[0:1], 31 -; GFX10-NEXT: s_lshl_b32 s1, s0, 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] -; GFX10-NEXT: s_mov_b32 s0, 0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_shl_i65_33: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s4, 0 +; GFX10PLUS-NEXT: s_lshl_b32 s5, s2, 1 +; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[0:1], 31 +; GFX10PLUS-NEXT: s_lshl_b32 s1, s0, 1 +; GFX10PLUS-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; GFX10PLUS-NEXT: s_mov_b32 s0, 0 +; GFX10PLUS-NEXT: ; return to shader part epilog %result = shl i65 %value, 33 ret i65 %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | 
FileCheck -check-prefixes=GCN,GFX10 %s ; Test gfx9+ s_shl[1-4]_add_u32 pattern matching diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s ; FIXME: ; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s @@ -47,6 +48,18 @@ ; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: ds_write_b128 v4, v[0:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_mov_b32_e32 v4, s0 +; GFX11-NEXT: ds_store_b128 v4, v[0:3] +; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out ret void } @@ -219,6 +232,60 @@ ; GFX10-NEXT: ds_write_b8 v1, v4 offset:14 ; GFX10-NEXT: ds_write_b8 v1, v5 offset:15 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v4i32_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_bfe_u32 s1, 8, 0x100000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-NEXT: s_bfe_u32 s3, s4, 0x100000 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: s_lshr_b32 s0, s5, 16 +; GFX11-NEXT: s_lshr_b32 s3, s3, s1 +; GFX11-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX11-NEXT: s_bfe_u32 s8, s6, 0x100000 +; GFX11-NEXT: s_lshr_b32 s9, s2, s1 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: s_lshr_b32 s5, s6, 16 +; GFX11-NEXT: s_lshr_b32 s2, s4, s1 +; GFX11-NEXT: s_lshr_b32 s4, s0, s1 +; GFX11-NEXT: s_lshr_b32 s0, s8, s1 +; GFX11-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v7, s2 +; GFX11-NEXT: v_mov_b32_e32 v8, s4 +; GFX11-NEXT: ds_store_b8 v1, v0 +; GFX11-NEXT: ds_store_b8 v1, v5 offset:1 +; GFX11-NEXT: ds_store_b8 v1, v3 offset:2 +; GFX11-NEXT: ds_store_b8 v1, v6 offset:3 +; GFX11-NEXT: ds_store_b8 v1, v2 offset:4 +; GFX11-NEXT: ds_store_b8 v1, v7 offset:5 +; GFX11-NEXT: ds_store_b8 v1, v4 offset:6 +; GFX11-NEXT: ds_store_b8 v1, v8 offset:7 +; GFX11-NEXT: v_mov_b32_e32 v3, s5 +; GFX11-NEXT: s_lshr_b32 s2, s7, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v7, s2 +; GFX11-NEXT: s_lshr_b32 s0, s5, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v5, s7 +; GFX11-NEXT: v_mov_b32_e32 v4, s0 +; GFX11-NEXT: s_bfe_u32 s0, s7, 0x100000 +; GFX11-NEXT: s_lshr_b32 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v6, s0 +; GFX11-NEXT: s_lshr_b32 s0, s2, s1 +; GFX11-NEXT: v_mov_b32_e32 v8, s0 +; GFX11-NEXT: ds_store_b8 v1, v0 offset:8 +; GFX11-NEXT: ds_store_b8 v1, v2 offset:9 +; GFX11-NEXT: ds_store_b8 v1, v3 offset:10 +; GFX11-NEXT: ds_store_b8 v1, v4 offset:11 +; GFX11-NEXT: ds_store_b8 v1, v5 offset:12 +; GFX11-NEXT: ds_store_b8 v1, v6 offset:13 +; GFX11-NEXT: ds_store_b8 v1, v7 offset:14 +; 
GFX11-NEXT: ds_store_b8 v1, v8 offset:15 +; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void } @@ -309,6 +376,31 @@ ; GFX10-NEXT: ds_write_b16 v1, v7 offset:10 ; GFX10-NEXT: ds_write_b16 v1, v8 offset:14 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v4i32_align2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s1, s4, 16 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: s_lshr_b32 s0, s5, 16 +; GFX11-NEXT: v_dual_mov_b32 v4, s7 :: v_dual_mov_b32 v5, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: s_lshr_b32 s2, s6, 16 +; GFX11-NEXT: s_lshr_b32 s3, s7, 16 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s2 +; GFX11-NEXT: v_mov_b32_e32 v8, s3 +; GFX11-NEXT: ds_store_b16 v1, v0 +; GFX11-NEXT: ds_store_b16 v1, v5 offset:2 +; GFX11-NEXT: ds_store_b16 v1, v2 offset:4 +; GFX11-NEXT: ds_store_b16 v1, v6 offset:6 +; GFX11-NEXT: ds_store_b16 v1, v3 offset:8 +; GFX11-NEXT: ds_store_b16 v1, v7 offset:10 +; GFX11-NEXT: ds_store_b16 v1, v4 offset:12 +; GFX11-NEXT: ds_store_b16 v1, v8 offset:14 +; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2 ret void } @@ -357,6 +449,19 @@ ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 ; GFX10-NEXT: ds_write2_b32 v1, v3, v4 offset0:2 offset1:3 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v4i32_align4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: v_mov_b32_e32 v4, s7 +; GFX11-NEXT: ds_store_2addr_b32 v1, v0, v2 offset1:1 +; GFX11-NEXT: ds_store_2addr_b32 v1, v3, v4 offset0:2 
offset1:3 +; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4 ret void } @@ -403,6 +508,18 @@ ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 ; GFX10-NEXT: ds_write2_b32 v1, v3, v4 offset0:2 offset1:3 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v4i32_align8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_mov_b32_e32 v4, s0 +; GFX11-NEXT: ds_store_2addr_b64 v4, v[0:1], v[2:3] offset1:1 +; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8 ret void } @@ -448,6 +565,18 @@ ; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: ds_write_b128 v4, v[0:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v4i32_align16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_mov_b32_e32 v4, s0 +; GFX11-NEXT: ds_store_b128 v4, v[0:3] +; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 
-verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s ; FIXME: ; XUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s @@ -44,6 +45,17 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: ds_store_b96 v3, v[0:2] +; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out ret void } @@ -181,6 +193,46 @@ ; GFX10-NEXT: ds_write_b8 v1, v2 offset:10 ; GFX10-NEXT: ds_write_b8 v1, v4 offset:11 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v3i32_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_bfe_u32 s1, 8, 0x100000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_bfe_u32 s3, s4, 0x100000 +; GFX11-NEXT: s_lshr_b32 s2, s4, 16 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: s_lshr_b32 s0, s5, 16 +; GFX11-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: s_lshr_b32 s5, s6, 16 +; GFX11-NEXT: s_lshr_b32 s3, s3, s1 +; GFX11-NEXT: s_bfe_u32 s7, s6, 0x100000 +; GFX11-NEXT: s_lshr_b32 s6, s2, s1 +; GFX11-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v7, s3 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: s_lshr_b32 s2, s4, s1 +; GFX11-NEXT: s_lshr_b32 s4, s0, s1 +; GFX11-NEXT: s_lshr_b32 s0, s7, s1 +; GFX11-NEXT: s_lshr_b32 s1, s5, s1 +; 
GFX11-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s2 +; GFX11-NEXT: v_dual_mov_b32 v10, s4 :: v_dual_mov_b32 v11, s0 +; GFX11-NEXT: v_mov_b32_e32 v12, s1 +; GFX11-NEXT: ds_store_b8 v1, v0 +; GFX11-NEXT: ds_store_b8 v1, v7 offset:1 +; GFX11-NEXT: ds_store_b8 v1, v4 offset:2 +; GFX11-NEXT: ds_store_b8 v1, v8 offset:3 +; GFX11-NEXT: ds_store_b8 v1, v2 offset:4 +; GFX11-NEXT: ds_store_b8 v1, v9 offset:5 +; GFX11-NEXT: ds_store_b8 v1, v5 offset:6 +; GFX11-NEXT: ds_store_b8 v1, v10 offset:7 +; GFX11-NEXT: ds_store_b8 v1, v3 offset:8 +; GFX11-NEXT: ds_store_b8 v1, v11 offset:9 +; GFX11-NEXT: ds_store_b8 v1, v6 offset:10 +; GFX11-NEXT: ds_store_b8 v1, v12 offset:11 +; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 ret void } @@ -256,6 +308,27 @@ ; GFX10-NEXT: ds_write_b16 v1, v5 offset:6 ; GFX10-NEXT: ds_write_b16 v1, v6 offset:10 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v3i32_align2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s1, s4, 16 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: s_lshr_b32 s0, s5, 16 +; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: s_lshr_b32 s2, s6, 16 +; GFX11-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s0 +; GFX11-NEXT: v_mov_b32_e32 v6, s2 +; GFX11-NEXT: ds_store_b16 v1, v0 +; GFX11-NEXT: ds_store_b16 v1, v3 offset:2 +; GFX11-NEXT: ds_store_b16 v1, v2 offset:4 +; GFX11-NEXT: ds_store_b16 v1, v5 offset:6 +; GFX11-NEXT: ds_store_b16 v1, v4 offset:8 +; GFX11-NEXT: ds_store_b16 v1, v6 offset:10 +; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2 ret void } @@ -301,6 +374,18 @@ ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 ; GFX10-NEXT: ds_write_b32 v1, v3 offset:8 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v3i32_align4: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: ds_store_2addr_b32 v1, v0, v2 offset1:1 +; GFX11-NEXT: ds_store_b32 v1, v3 offset:8 +; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4 ret void } @@ -346,6 +431,18 @@ ; GFX10-NEXT: ds_write2_b32 v1, v0, v2 offset1:1 ; GFX10-NEXT: ds_write_b32 v1, v3 offset:8 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v3i32_align8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, s6 +; GFX11-NEXT: ds_store_2addr_b32 v1, v0, v2 offset1:1 +; GFX11-NEXT: ds_store_b32 v1, v3 offset:8 +; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8 ret void } @@ -388,6 +485,17 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v3i32_align16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v1, s13 +; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: ds_store_b96 v3, v[0:2] +; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -4,6 +4,7 @@ ; RUN: llc -global-isel -march=amdgcn 
-mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps i32 @scalar_xnor_i32_one_use(i32 inreg %a, i32 inreg %b) { ; GCN-LABEL: scalar_xnor_i32_one_use: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll @@ -3,6 +3,7 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck --check-prefix=GFX11 %s define i32 @zextload_global_i1_to_i32(i1 addrspace(1)* %ptr) { ; GFX9-LABEL: zextload_global_i1_to_i32: @@ -159,6 +160,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: zextload_global_i1_to_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load i1, i1 addrspace(1)* %ptr %ext = zext i1 %load to i64 ret i64 %ext @@ -200,6 +210,15 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 
s[30:31] +; +; GFX11-LABEL: zextload_global_i8_to_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load i8, i8 addrspace(1)* %ptr %ext = zext i8 %load to i64 ret i64 %ext @@ -241,6 +260,15 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: zextload_global_i16_to_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load i16, i16 addrspace(1)* %ptr %ext = zext i16 %load to i64 ret i64 %ext @@ -282,6 +310,15 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: zextload_global_i32_to_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load i32, i32 addrspace(1)* %ptr %ext = zext i32 %load to i64 ret i64 %ext @@ -327,6 +364,15 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: zextload_global_i32_to_i96: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load i32, i32 addrspace(1)* %ptr %ext = zext i32 %load to i96 ret i96 %ext @@ -376,6 
+422,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: zextload_global_i32_to_i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load i32, i32 addrspace(1)* %ptr %ext = zext i32 %load to i128 ret i128 %ext diff --git a/llvm/test/CodeGen/AMDGPU/add3.ll b/llvm/test/CodeGen/AMDGPU/add3.ll --- a/llvm/test/CodeGen/AMDGPU/add3.ll +++ b/llvm/test/CodeGen/AMDGPU/add3.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s ; =================================================================================== ; V_ADD3_U32 diff --git a/llvm/test/CodeGen/AMDGPU/add_shl.ll b/llvm/test/CodeGen/AMDGPU/add_shl.ll --- a/llvm/test/CodeGen/AMDGPU/add_shl.ll +++ b/llvm/test/CodeGen/AMDGPU/add_shl.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s ; 
=================================================================================== ; V_ADD_LSHL_U32 diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll --- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll +++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX9 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX10 +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX11 define amdgpu_kernel void @test0() { ; GFX9-LABEL: test0: @@ -10,6 +11,10 @@ ; GFX10-LABEL: test0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_endpgm tail call void @llvm.amdgcn.endpgm() unreachable } @@ -25,6 +30,12 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm tail call void @llvm.amdgcn.endpgm() unreachable } @@ -63,6 +74,23 @@ ; GFX10-NEXT: s_endpgm ; GFX10-NEXT: .LBB2_2: ; %then ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: test2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_cmp_lt_i32 s2, 1 +; GFX11-NEXT: s_cbranch_scc0 .LBB2_2 +; GFX11-NEXT: ; %bb.1: ; %else +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB2_2: ; %then +; GFX11-NEXT: s_endpgm %cond = icmp sgt i32 %x, 0 
br i1 %cond, label %then, label %else diff --git a/llvm/test/CodeGen/AMDGPU/and_or.ll b/llvm/test/CodeGen/AMDGPU/and_or.ll --- a/llvm/test/CodeGen/AMDGPU/and_or.ll +++ b/llvm/test/CodeGen/AMDGPU/and_or.ll @@ -2,6 +2,7 @@ ;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s ;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s ;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s +;RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s ; =================================================================================== ; V_AND_OR_B32 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -4,6 +4,8 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck 
-enable-var-scope -check-prefixes=GFX11,GFX11W32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg) @@ -150,6 +152,67 @@ ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: add_i32_constant: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB0_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: add_i32_constant: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_mov_b32 s5, exec_lo +; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB0_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) store i32 %old, i32 addrspace(1)* %out @@ -304,6 +367,71 @@ ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: add_i32_uniform: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_clause 0x1 +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX11W64-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc +; GFX11W64-NEXT: .LBB1_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1] +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: add_i32_uniform: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_clause 0x1 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 +; GFX11W32-NEXT: s_mov_b32 s6, exec_lo +; GFX11W32-NEXT: s_mov_b32 s5, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB1_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1] +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX11W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0) store i32 %old, i32 addrspace(1)* %out @@ -524,6 +652,127 @@ ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: add_i32_varying_vdata: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W64-NEXT: s_not_b64 exec, exec +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_not_b64 exec, exec +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 +; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 +; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; 
GFX11W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 +; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 +; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 +; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 +; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: add_i32_varying_vdata: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_not_b32 
exec_lo, exec_lo +; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 +; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 +; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; 
GFX11W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W32-NEXT: s_mov_b32 s5, s6 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB2_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 +; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) @@ -757,6 +1006,133 @@ ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: struct_add_i32_varying_vdata: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W64-NEXT: s_not_b64 exec, exec +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_not_b64 exec, exec +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 +; GFX11W64-NEXT: 
v_permlanex16_b32 v2, v2, -1, -1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 +; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 +; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 +; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 +; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11W64-NEXT: s_cbranch_execz .LBB3_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_clause 0x1 +; GFX11W64-NEXT: s_load_b32 s7, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: v_mov_b32_e32 v4, s7 +; GFX11W64-NEXT: 
buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB3_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 +; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: struct_add_i32_varying_vdata: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 +; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 +; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 +; 
GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX11W32-NEXT: s_cbranch_execz .LBB3_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_mov_b32 s5, s6 +; GFX11W32-NEXT: s_clause 0x1 +; GFX11W32-NEXT: s_load_b32 s6, s[0:1], 0x44 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: v_mov_b32_e32 v4, s6 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W32-NEXT: .LBB3_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 +; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 %vindex, i32 0, i32 0, i32 0) @@ -814,6 +1190,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add_i32_varying_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; 
GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) @@ -964,6 +1353,69 @@ ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: sub_i32_constant: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB5_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; 
GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: sub_i32_constant: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_mov_b32 s5, exec_lo +; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB5_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) store i32 %old, i32 addrspace(1)* %out @@ -1120,6 +1572,73 @@ ; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: sub_i32_uniform: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_clause 0x1 +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: 
v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc +; GFX11W64-NEXT: .LBB6_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: sub_i32_uniform: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_clause 0x1 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 +; GFX11W32-NEXT: s_mov_b32 s6, exec_lo +; GFX11W32-NEXT: s_mov_b32 s5, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 
glc +; GFX11W32-NEXT: .LBB6_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: v_mul_lo_u32 v0, s4, v0 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0) store i32 %old, i32 addrspace(1)* %out @@ -1340,6 +1859,127 @@ ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: sub_i32_varying_vdata: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W64-NEXT: s_not_b64 exec, exec +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_not_b64 exec, exec +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 +; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 
quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 +; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 +; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 +; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 +; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB7_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 +; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: sub_i32_varying_vdata: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 +; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 +; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; 
GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W32-NEXT: s_mov_b32 s5, s6 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB7_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 +; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) @@ -1397,6 +2037,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sub_i32_varying_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 
@llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -4,6 +4,8 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -146,6 +148,81 @@ ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: add_i32_constant: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: s_mov_b64 s[6:7], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB0_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_mul_i32 s6, s6, 5 +; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: .LBB0_2: +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: add_i32_constant: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB0_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_mul_i32 s5, s5, 5 +; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: v_mov_b32_e32 v1, s5 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: .LBB0_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel store i32 %old, i32 addrspace(1)* %out @@ -332,6 +409,85 @@ ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1] ; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: add_i32_uniform: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b32 s8, s[0:1], 0x34 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB1_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mul_i32 s2, s8, s2 +; GFX1164-NEXT: s_mov_b32 s14, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc +; 
GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: .LBB1_2: +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1] +; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: add_i32_uniform: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB1_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s8, s6 +; GFX1132-NEXT: s_mov_b32 s9, s7 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: .LBB1_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] +; GFX1132-NEXT: 
buffer_store_b32 v1, off, s[4:7], 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel store i32 %old, i32 addrspace(1)* %out @@ -587,6 +743,144 @@ ; GFX1032-NEXT: s_mov_b32 s2, s6 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: add_i32_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_not_b64 exec, exec +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: s_not_b64 exec, exec +; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf +; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1164-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164-NEXT: s_mov_b32 s4, s9 +; GFX1164-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1164-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1164-NEXT: s_cbranch_execz .LBB2_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mov_b32 s4, s2 +; GFX1164-NEXT: s_mov_b32 s5, s3 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: .LBB2_2: +; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1164-NEXT: s_mov_b32 s2, s6 +; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: add_i32_varying: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1132-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_mov_b32 
s4, s6 +; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB2_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mov_b32 s4, s2 +; GFX1132-NEXT: s_mov_b32 s5, s3 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: .LBB2_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1132-NEXT: s_mov_b32 s2, s6 +; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel @@ -747,6 +1041,84 @@ ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: add_i64_constant: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: s_mov_b64 s[6:7], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB3_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164-NEXT: v_mov_b32_e32 v1, 
0 +; GFX1164-NEXT: s_mul_i32 s6, s6, 5 +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: .LBB3_2: +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: add_i64_constant: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB3_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_mul_i32 s5, s5, 5 +; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) 
+; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: .LBB3_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel store i64 %old, i64 addrspace(1)* %out @@ -969,6 +1341,101 @@ ; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s3, v2, v[1:2] ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: add_i64_uniform: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9] +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mul_i32 s9, s1, s8 +; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8 +; GFX1164-NEXT: s_mul_i32 s8, s0, s8 +; GFX1164-NEXT: s_add_i32 s10, s10, s9 +; GFX1164-NEXT: v_mov_b32_e32 v0, s8 +; GFX1164-NEXT: v_mov_b32_e32 v1, s10 +; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: s_mov_b32 s8, s6 +; GFX1164-NEXT: s_mov_b32 s9, s7 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: .LBB4_2: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3] +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v1, v3 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: add_i64_uniform: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mul_i32 s8, s1, s3 +; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3 +; GFX1132-NEXT: s_mul_i32 s3, s0, s3 +; GFX1132-NEXT: s_add_i32 s9, s9, s8 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9 +; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: s_mov_b32 s8, s6 +; GFX1132-NEXT: s_mov_b32 s9, s7 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: .LBB4_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3] +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2] +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mov_b32_e32 v1, v3 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel store i64 %old, i64 addrspace(1)* %out @@ -1037,6 +1504,29 @@ ; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add_i64_varying: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %lane = call i32 
@llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -1220,6 +1710,83 @@ ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: sub_i32_constant: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: s_mov_b64 s[6:7], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB6_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_mul_i32 s6, s6, 5 +; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: .LBB6_2: +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: sub_i32_constant: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: 
v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB6_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_mul_i32 s5, s5, 5 +; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: v_mov_b32_e32 v1, s5 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: .LBB6_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel store i32 %old, i32 addrspace(1)* %out @@ -1408,6 +1975,87 @@ ; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: sub_i32_uniform: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1164-NEXT: s_load_b32 s8, s[0:1], 0x34 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: 
v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB7_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_mov_b32 s15, 0x31016000 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mul_i32 s2, s8, s2 +; GFX1164-NEXT: s_mov_b32 s14, -1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: .LBB7_2: +; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: sub_i32_uniform: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: s_mov_b32 s1, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB7_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mul_i32 s2, s0, s2 +; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: v_mov_b32_e32 v1, s2 +; GFX1132-NEXT: s_mov_b32 s8, s6 +; GFX1132-NEXT: 
s_mov_b32 s9, s7 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: .LBB7_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel store i32 %old, i32 addrspace(1)* %out @@ -1663,6 +2311,144 @@ ; GFX1032-NEXT: s_mov_b32 s2, s6 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: sub_i32_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_not_b64 exec, exec +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: s_not_b64 exec, exec +; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 
v2, v1 +; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1164-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1164-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1164-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1164-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1164-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1164-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1164-NEXT: s_mov_b32 s4, s9 +; GFX1164-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1164-NEXT: s_mov_b64 exec, s[6:7] +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1164-NEXT: s_cbranch_execz .LBB8_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mov_b32 s4, s2 +; GFX1164-NEXT: s_mov_b32 s5, s3 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: .LBB8_2: +; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1164-NEXT: s_mov_b32 s2, s6 +; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: sub_i32_varying: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf 
+; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1132-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1132-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1132-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: s_mov_b32 s4, s6 +; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB8_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mov_b32 s4, s2 +; GFX1132-NEXT: s_mov_b32 s5, s3 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: .LBB8_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1132-NEXT: s_mov_b32 s2, s6 +; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel @@ -1869,6 +2655,90 @@ ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; 
GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: sub_i64_constant: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: s_mov_b64 s[6:7], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB9_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: s_mul_i32 s6, s6, 5 +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: .LBB9_2: +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 +; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: sub_i64_constant: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1132-NEXT: s_mov_b32 s5, 
exec_lo +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB9_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_mul_i32 s5, s5, 5 +; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: .LBB9_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 +; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel store i64 %old, i64 addrspace(1)* %out @@ -2100,6 +2970,105 @@ ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: sub_i64_uniform: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: s_load_b128 s[4:7], 
s[0:1], 0x24 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB10_2 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9] +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_mul_i32 s9, s1, s8 +; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8 +; GFX1164-NEXT: s_mul_i32 s8, s0, s8 +; GFX1164-NEXT: s_add_i32 s10, s10, s9 +; GFX1164-NEXT: v_mov_b32_e32 v0, s8 +; GFX1164-NEXT: v_mov_b32_e32 v1, s10 +; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: s_mov_b32 s8, s6 +; GFX1164-NEXT: s_mov_b32 s9, s7 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: .LBB10_2: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0 +; GFX1164-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5] +; GFX1164-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v1, v5 +; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc +; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; 
GFX1132-LABEL: sub_i64_uniform: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB10_2 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_mul_i32 s8, s1, s3 +; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3 +; GFX1132-NEXT: s_mul_i32 s3, s0, s3 +; GFX1132-NEXT: s_add_i32 s9, s9, s8 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9 +; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: s_mov_b32 s8, s6 +; GFX1132-NEXT: s_mov_b32 s9, s7 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: .LBB10_2: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0 +; GFX1132-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5] +; GFX1132-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mov_b32_e32 v1, v5 +; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; 
GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel store i64 %old, i64 addrspace(1)* %out @@ -2168,6 +3137,29 @@ ; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sub_i64_varying: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -4,6 +4,8 @@ ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs 
-simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s declare i1 @llvm.amdgcn.wqm.vote(i1) declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg) @@ -148,6 +150,85 @@ ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: .LBB0_6: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: add_i32_constant: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_mov_b64 s[10:11], exec +; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] +; GFX1164-NEXT: s_cbranch_execz .LBB0_4 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_mov_b64 s[12:13], exec +; GFX1164-NEXT: s_mov_b64 s[10:11], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0 +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-NEXT: ; %bb.2: +; GFX1164-NEXT: s_bcnt1_i32_b64 s12, s[12:13] +; GFX1164-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_mul_i32 s12, s12, 5 +; GFX1164-NEXT: v_mov_b32_e32 v1, s12 +; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc +; GFX1164-NEXT: .LBB0_3: +; GFX1164-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX1164-NEXT: .LBB0_4: ; %Flow +; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: s_wqm_b64 s[4:5], -1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 vcc, exec, s[4:5] +; GFX1164-NEXT: s_cbranch_vccnz .LBB0_6 +; GFX1164-NEXT: ; %bb.5: ; %if +; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: .LBB0_6: ; %UnifiedReturnBlock +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: add_i32_constant: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_mov_b32 s9, exec_lo +; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_saveexec_b32 s8, s9 +; GFX1132-NEXT: s_cbranch_execz .LBB0_4 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_mov_b32 s10, exec_lo +; GFX1132-NEXT: s_mov_b32 s9, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0 +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-NEXT: ; %bb.2: +; GFX1132-NEXT: s_bcnt1_i32_b32 s10, s10 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_mul_i32 s10, s10, 5 +; GFX1132-NEXT: v_mov_b32_e32 v1, s10 +; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc +; GFX1132-NEXT: .LBB0_3: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1132-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX1132-NEXT: .LBB0_4: ; %Flow +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: s_wqm_b32 s4, -1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX1132-NEXT: s_cbranch_vccnz .LBB0_6 +; GFX1132-NEXT: ; %bb.5: ; %if +; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: .LBB0_6: ; %UnifiedReturnBlock +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true) %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) @@ -403,6 +484,143 @@ ; GFX1032-NEXT: buffer_store_dword v4, off, s[0:3], 0 ; GFX1032-NEXT: .LBB1_6: ; %UnifiedReturnBlock ; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: add_i32_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: ; implicit-def: $vgpr4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] +; GFX1164-NEXT: s_cbranch_execz .LBB1_4 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_not_b64 exec, exec +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: s_not_b64 exec, exec +; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 
row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1164-NEXT: v_readlane_b32 s12, v1, 31 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s12 +; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1164-NEXT: v_readlane_b32 s12, v1, 15 +; GFX1164-NEXT: v_readlane_b32 s13, v1, 31 +; GFX1164-NEXT: v_writelane_b32 v3, s12, 16 +; GFX1164-NEXT: s_mov_b64 exec, s[10:11] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX1164-NEXT: v_readlane_b32 s12, v1, 63 +; GFX1164-NEXT: v_readlane_b32 s14, v1, 47 +; GFX1164-NEXT: v_writelane_b32 v3, s13, 32 +; GFX1164-NEXT: s_mov_b64 exec, s[10:11] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: s_or_saveexec_b64 s[10:11], -1 +; GFX1164-NEXT: v_writelane_b32 v3, s14, 48 +; GFX1164-NEXT: s_mov_b64 exec, s[10:11] +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GFX1164-NEXT: s_cbranch_execz .LBB1_3 +; GFX1164-NEXT: ; %bb.2: +; GFX1164-NEXT: v_mov_b32_e32 v0, s12 +; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1164-NEXT: .LBB1_3: +; GFX1164-NEXT: s_or_b64 exec, exec, s[10:11] +; 
GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1164-NEXT: v_mov_b32_e32 v0, v3 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_e32 v4, s4, v0 +; GFX1164-NEXT: .LBB1_4: ; %Flow +; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: s_wqm_b64 s[4:5], -1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 vcc, exec, s[4:5] +; GFX1164-NEXT: s_cbranch_vccnz .LBB1_6 +; GFX1164-NEXT: ; %bb.5: ; %if +; GFX1164-NEXT: buffer_store_b32 v4, off, s[0:3], 0 +; GFX1164-NEXT: .LBB1_6: ; %UnifiedReturnBlock +; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: add_i32_varying: +; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_mov_b32 s8, exec_lo +; GFX1132-NEXT: ; implicit-def: $vgpr4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_mov_b32 s9, s8 +; GFX1132-NEXT: s_and_saveexec_b32 s8, s9 +; GFX1132-NEXT: s_cbranch_execz .LBB1_4 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-NEXT: s_or_saveexec_b32 s9, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1132-NEXT: v_readlane_b32 s11, v1, 31 +; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1132-NEXT: v_readlane_b32 s10, v1, 15 +; GFX1132-NEXT: s_mov_b32 exec_lo, s9 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_or_saveexec_b32 s9, -1 +; GFX1132-NEXT: v_writelane_b32 v3, s10, 16 +; GFX1132-NEXT: s_mov_b32 exec_lo, s9 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1132-NEXT: ; implicit-def: $vgpr0 +; GFX1132-NEXT: s_and_saveexec_b32 s9, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB1_3 +; GFX1132-NEXT: ; %bb.2: +; GFX1132-NEXT: v_mov_b32_e32 v0, s11 +; GFX1132-NEXT: s_mov_b32 s10, s11 +; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1132-NEXT: .LBB1_3: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1132-NEXT: v_mov_b32_e32 v0, v3 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_e32 v4, s4, v0 +; GFX1132-NEXT: .LBB1_4: ; %Flow +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: s_wqm_b32 s4, -1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX1132-NEXT: s_cbranch_vccnz .LBB1_6 +; GFX1132-NEXT: ; %bb.5: ; %if +; GFX1132-NEXT: buffer_store_b32 v4, off, s[0:3], 0 +; GFX1132-NEXT: .LBB1_6: ; %UnifiedReturnBlock +; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132-NEXT: s_endpgm entry: %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true) %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i32 0) 
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -4,6 +4,8 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32) @@ -149,6 +151,67 @@ ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: add_i32_constant: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 +; 
GFX11W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB0_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: add_i32_constant: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_mov_b32 s5, exec_lo +; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB0_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, 
s0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) store i32 %old, i32 addrspace(1)* %out @@ -303,6 +366,71 @@ ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: add_i32_uniform: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_clause 0x1 +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[12:15], 0 glc +; GFX11W64-NEXT: .LBB1_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1] +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: add_i32_uniform: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_clause 0x1 +; GFX11W32-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 +; GFX11W32-NEXT: s_mov_b32 s6, exec_lo +; GFX11W32-NEXT: s_mov_b32 s5, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB1_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1] +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0) store i32 %old, i32 addrspace(1)* %out @@ -523,6 +651,127 @@ ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: add_i32_varying_vdata: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W64-NEXT: s_not_b64 exec, exec +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_not_b64 exec, exec +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: v_mov_b32_e32 v3, 
0 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 +; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 +; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 +; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 +; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 +; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_writelane_b32 v3, 
s8, 48 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 +; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: add_i32_varying_vdata: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 +; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX11W32-NEXT: 
s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 +; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W32-NEXT: s_mov_b32 s5, s6 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB2_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 +; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) @@ 
-580,6 +829,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add_i32_varying_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) @@ -730,6 +992,69 @@ ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: sub_i32_constant: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W64-NEXT: s_cbranch_execz .LBB4_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB4_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: 
v_mul_u32_u24_e32 v0, 5, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: sub_i32_constant: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_mov_b32 s5, exec_lo +; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W32-NEXT: s_cbranch_execz .LBB4_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB4_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) store i32 %old, i32 addrspace(1)* %out @@ -886,6 +1211,73 @@ ; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: sub_i32_uniform: +; GFX11W64: ; 
%bb.0: ; %entry +; GFX11W64-NEXT: s_clause 0x1 +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[4:5] +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc +; GFX11W64-NEXT: .LBB5_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: sub_i32_uniform: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_clause 0x1 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 +; GFX11W32-NEXT: s_mov_b32 s6, exec_lo +; GFX11W32-NEXT: s_mov_b32 s5, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: 
s_bcnt1_i32_b32 s0, s6 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB5_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: v_mul_lo_u32 v0, s4, v0 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0) store i32 %old, i32 addrspace(1)* %out @@ -1106,6 +1498,127 @@ ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: sub_i32_varying_vdata: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W64-NEXT: s_not_b64 exec, exec +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_not_b64 exec, exec +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 +; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 +; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 +; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 +; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 +; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc 
+; GFX11W64-NEXT: .LBB6_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 +; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: sub_i32_varying_vdata: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 +; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 +; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W32-NEXT: s_mov_b32 s5, s6 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB6_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 +; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) @@ -1163,6 +1676,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sub_i32_varying_offset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc +; GFX11-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -4,6 +4,8 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32) @@ -154,6 +156,69 @@ ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: add_i32_constant: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; 
GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W64-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB0_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: add_i32_constant: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_mov_b32 s5, exec_lo +; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W32-NEXT: s_cbranch_execz .LBB0_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc +; GFX11W32-NEXT: .LBB0_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) store i32 %old, i32 addrspace(1)* %out @@ -313,6 +378,73 @@ ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: add_i32_uniform: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_clause 0x1 +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 +; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W64-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[12:15], 0 idxen glc +; GFX11W64-NEXT: .LBB1_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1] +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: add_i32_uniform: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_clause 0x1 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 +; GFX11W32-NEXT: s_mov_b32 s6, exec_lo +; GFX11W32-NEXT: s_mov_b32 s5, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W32-NEXT: s_cbranch_execz .LBB1_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc +; GFX11W32-NEXT: .LBB1_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1] +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) store i32 %old, i32 addrspace(1)* %out @@ -538,6 +670,129 @@ ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: add_i32_varying_vdata: +; 
GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W64-NEXT: s_not_b64 exec, exec +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_not_b64 exec, exec +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 +; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 +; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 +; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 
+; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 +; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 +; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 +; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: add_i32_varying_vdata: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 +; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 +; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W32-NEXT: s_mov_b32 s5, s6 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W32-NEXT: .LBB2_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; 
GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 +; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) @@ -595,6 +850,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add_i32_varying_vindex: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0) @@ -665,6 +933,38 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v0, v2, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11W64-LABEL: add_i32_varying_offset: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W64-NEXT: v_mov_b32_e32 v2, 1 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: 
global_store_b32 v0, v2, s[0:1] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: add_i32_varying_offset: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 1 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0) @@ -820,6 +1120,71 @@ ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: sub_i32_constant: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W64-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX11W64-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, 
s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB5_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: sub_i32_constant: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_mov_b32 s5, exec_lo +; GFX11W32-NEXT: s_mov_b32 s4, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W32-NEXT: s_cbranch_execz .LBB5_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s5 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX11W32-NEXT: s_mul_i32 s0, s0, 5 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc +; GFX11W32-NEXT: .LBB5_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) store i32 %old, i32 
addrspace(1)* %out @@ -981,6 +1346,75 @@ ; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: sub_i32_uniform: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_clause 0x1 +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_load_b32 s8, s[0:1], 0x44 +; GFX11W64-NEXT: s_mov_b64 s[6:7], exec +; GFX11W64-NEXT: s_mov_b64 s[4:5], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[12:15], s[0:1], 0x34 +; GFX11W64-NEXT: s_bcnt1_i32_b64 s0, s[6:7] +; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: s_mul_i32 s0, s8, s0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[12:15], 0 idxen glc +; GFX11W64-NEXT: .LBB6_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: sub_i32_uniform: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_clause 0x1 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: s_load_b32 s4, s[0:1], 0x44 +; GFX11W32-NEXT: s_mov_b32 s6, exec_lo +; GFX11W32-NEXT: s_mov_b32 s5, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX11W32-NEXT: ; 
implicit-def: $vgpr1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_bcnt1_i32_b32 s0, s6 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: s_mul_i32 s0, s4, s0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_e32 v1, s0 +; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc +; GFX11W32-NEXT: .LBB6_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: v_mul_lo_u32 v0, s4, v0 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) store i32 %old, i32 addrspace(1)* %out @@ -1206,6 +1640,129 @@ ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] ; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: sub_i32_varying_vdata: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W64-NEXT: s_not_b64 exec, exec +; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W64-NEXT: s_not_b64 exec, exec +; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 +; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 +; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 15 +; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_readlane_b32 s7, v1, 31 +; GFX11W64-NEXT: v_writelane_b32 v3, s6, 16 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_readlane_b32 s6, v1, 63 +; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 +; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 +; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11W64-NEXT: ; implicit-def: 
$vgpr0 +; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB7_2: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 +; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: sub_i32_varying_vdata: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo +; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 +; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11W32-NEXT: 
s_or_saveexec_b32 s4, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 +; GFX11W32-NEXT: v_readlane_b32 s6, v1, 31 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 +; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 +; GFX11W32-NEXT: ; %bb.1: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W32-NEXT: s_mov_b32 s5, s6 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W32-NEXT: .LBB7_2: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 +; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0) @@ -1263,6 +1820,19 @@ ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sub_i32_varying_vindex: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0) @@ -1333,6 +1903,38 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v0, v2, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11W64-LABEL: sub_i32_varying_offset: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W64-NEXT: v_mov_b32_e32 v2, 1 +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W64-NEXT: s_endpgm +; +; GFX11W32-LABEL: sub_i32_varying_offset: +; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_mov_b32 s2, 0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 1 +; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 
idxen offen glc +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_waitcnt vmcnt(0) +; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11W32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -6,6 +6,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX1010,GFX10W32 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX1030,GFX10W32 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX1030,GFX10W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s ; GCN-ISEL-LABEL: name: sadd64rr ; GCN-ISEL-LABEL: body: @@ -57,6 +58,9 @@ ; GFX10W64: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, v{{[0-9]+}} ; GFX1010: v_add_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], s{{[0-9]+}}, 0, [[CARRY]] ; GFX1030: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]] +; +; GFX11: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}} +; GFX11: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]] define amdgpu_kernel void @vadd64rr(i64 addrspace(1)* %out, i64 %a) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -89,6 +93,9 @@ ; GFX10W64: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], 0x56789876, v{{[0-9]+}} ; GFX1010: v_add_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], 0, 0x1234, 
[[CARRY]] ; GFX1030: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, 0, 0x1234, [[CARRY]] +; +; GFX11: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], 0x56789876, v{{[0-9]+}} +; GFX11: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, 0, 0x1234, [[CARRY]] define amdgpu_kernel void @vadd64ri(i64 addrspace(1)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -134,6 +141,9 @@ ; GFX10W32: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} ; GFX10W64: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, s{{[0-9]+}} ; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CARRY]] +; +; GFX11: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} +; GFX11: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CARRY]] define amdgpu_kernel void @uaddo32_vcc_user(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 { %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %uadd, 0 @@ -181,6 +191,9 @@ ; GFX10W64: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, v0 ; GFX1010: v_add_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], s{{[0-9]+}}, 0, [[CARRY]] ; GFX1030: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]] +; +; GFX11: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v0 +; GFX11: v_add_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]] define amdgpu_kernel void @vuaddo64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -242,6 +255,9 @@ ; GFX10W64: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, v{{[0-9]+}} ; GFX1010: v_sub_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], s{{[0-9]+}}, 0, [[CARRY]] ; GFX1030: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]] +; +; GFX11: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}} +; GFX11: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]] define 
amdgpu_kernel void @vsub64rr(i64 addrspace(1)* %out, i64 %a) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -274,6 +290,9 @@ ; GFX10W64: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], 0x56789876, v{{[0-9]+}} ; GFX1010: v_sub_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], 0x1234, 0, [[CARRY]] ; GFX1030: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, 0x1234, 0, [[CARRY]] +; +; GFX11: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], 0x56789876, v{{[0-9]+}} +; GFX11: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, 0x1234, 0, [[CARRY]] define amdgpu_kernel void @vsub64ri(i64 addrspace(1)* %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -319,6 +338,9 @@ ; GFX10W32: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} ; GFX10W64: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, s{{[0-9]+}} ; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CARRY]] +; +; GFX11: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} +; GFX11: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CARRY]] define amdgpu_kernel void @usubo32_vcc_user(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 { %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %usub, 0 @@ -366,6 +388,9 @@ ; GFX10W64: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, v0 ; GFX1010: v_sub_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], s{{[0-9]+}}, 0, [[CARRY]] ; GFX1030: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]] +; +; GFX11: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v0 +; GFX11: v_sub_co_ci_u32_e64 v{{[0-9]+}}, null, s{{[0-9]+}}, 0, [[CARRY]] define amdgpu_kernel void @vusubo64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ 
b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX1100 %s define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 { ; GFX803-LABEL: test_kern_empty: @@ -15,6 +16,10 @@ ; GFX1010-LABEL: test_kern_empty: ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_endpgm +; +; GFX1100-LABEL: test_kern_empty: +; GFX1100: ; %bb.0: ; %entry +; GFX1100-NEXT: s_endpgm entry: ret void } @@ -46,6 +51,14 @@ ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-NEXT: s_endpgm +; +; GFX1100-LABEL: test_kern_stack: +; GFX1100: ; %bb.0: ; %entry +; GFX1100-NEXT: v_mov_b32_e32 v0, 0 +; GFX1100-NEXT: scratch_store_b32 off, v0, off offset:4 dlc +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-NEXT: s_endpgm entry: %x = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %x, align 4 @@ -113,6 +126,24 @@ ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm +; +; GFX1100-LABEL: test_kern_call: +; GFX1100: ; %bb.0: ; %entry +; GFX1100-NEXT: v_mov_b32_e32 v31, v0 +; GFX1100-NEXT: s_mov_b32 s12, s13 +; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1100-NEXT: s_mov_b32 s13, s14 +; GFX1100-NEXT: s_mov_b32 s14, s15 +; GFX1100-NEXT: s_mov_b32 s32, 0 +; GFX1100-NEXT: s_getpc_b64 s[16:17] +; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 
s17, s17, ex@rel32@hi+12 +; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1100-NEXT: s_endpgm + entry: tail call void @ex() #0 ret void @@ -188,6 +219,28 @@ ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm +; +; GFX1100-LABEL: test_kern_stack_and_call: +; GFX1100: ; %bb.0: ; %entry +; GFX1100-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-NEXT: v_mov_b32_e32 v31, v0 +; GFX1100-NEXT: s_mov_b32 s12, s13 +; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1100-NEXT: s_mov_b32 s13, s14 +; GFX1100-NEXT: s_mov_b32 s14, s15 +; GFX1100-NEXT: s_mov_b32 s32, 16 +; GFX1100-NEXT: scratch_store_b32 off, v1, off offset:4 dlc +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: s_getpc_b64 s[16:17] +; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 +; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-NEXT: s_endpgm + entry: %x = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %x, align 4 @@ -210,6 +263,12 @@ ; GFX1010: ; %bb.0: ; %entry ; GFX1010-NEXT: s_mov_b32 s33, 0 ; GFX1010-NEXT: s_endpgm +; +; GFX1100-LABEL: test_force_fp_kern_empty: +; GFX1100: ; %bb.0: ; %entry +; GFX1100-NEXT: s_mov_b32 s33, 0 +; GFX1100-NEXT: s_endpgm + entry: ret void } @@ -244,6 +303,15 @@ ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-NEXT: s_endpgm +; +; GFX1100-LABEL: test_force_fp_kern_stack: +; GFX1100: ; %bb.0: ; %entry +; GFX1100-NEXT: v_mov_b32_e32 v0, 0 +; GFX1100-NEXT: s_mov_b32 s33, 0 +; GFX1100-NEXT: scratch_store_b32 off, v0, s33 offset:4 dlc +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-NEXT: s_endpgm entry: %x = alloca 
i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %x, align 4 @@ -314,6 +382,24 @@ ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm +; +; GFX1100-LABEL: test_force_fp_kern_call: +; GFX1100: ; %bb.0: ; %entry +; GFX1100-NEXT: v_mov_b32_e32 v31, v0 +; GFX1100-NEXT: s_mov_b32 s12, s13 +; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1100-NEXT: s_mov_b32 s13, s14 +; GFX1100-NEXT: s_mov_b32 s14, s15 +; GFX1100-NEXT: s_mov_b32 s32, 0 +; GFX1100-NEXT: s_mov_b32 s33, 0 +; GFX1100-NEXT: s_getpc_b64 s[16:17] +; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 +; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1100-NEXT: s_endpgm entry: tail call void @ex() #2 ret void @@ -392,6 +478,28 @@ ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm +; +; GFX1100-LABEL: test_force_fp_kern_stack_and_call: +; GFX1100: ; %bb.0: ; %entry +; GFX1100-NEXT: v_mov_b32_e32 v1,
0 +; GFX1100-NEXT: v_mov_b32_e32 v31, v0 +; GFX1100-NEXT: s_mov_b32 s33, 0 +; GFX1100-NEXT: s_mov_b32 s12, s13 +; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX1100-NEXT: s_mov_b32 s13, s14 +; GFX1100-NEXT: s_mov_b32 s14, s15 +; GFX1100-NEXT: s_mov_b32 s32, 16 +; GFX1100-NEXT: scratch_store_b32 off, v1, s33 offset:4 dlc +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: s_getpc_b64 s[16:17] +; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 +; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-NEXT: s_endpgm entry: %x = alloca i32, align 4, addrspace(5) store volatile i32 0, i32 addrspace(5)* %x, align 4 @@ -451,6 +559,22 @@ ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-NEXT: s_endpgm +; +; GFX1100-LABEL: test_sgpr_offset_kernel: +; GFX1100: ; %bb.0: ; %entry +; GFX1100-NEXT: scratch_load_b32 v0, off, off offset:8 glc dlc +; GFX1100-NEXT: s_waitcnt vmcnt(0) +; GFX1100-NEXT: s_movk_i32 s0, 0x1000 +; GFX1100-NEXT: scratch_store_b32 off, v0, s0 ; 4-byte Folded Spill +; GFX1100-NEXT: s_movk_i32 s0, 0x1000 +; GFX1100-NEXT: ;;#ASMSTART +; GFX1100-NEXT: ;;#ASMEND +; GFX1100-NEXT: scratch_load_b32 v0, off, s0 ; 4-byte Folded Reload +; GFX1100-NEXT: s_waitcnt vmcnt(0) +; GFX1100-NEXT: scratch_store_b32 off, v0, off offset:8 dlc +; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-NEXT: s_endpgm entry: ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not ; fit in the instruction, and has to live in the SGPR offset. 
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -mattr=-unaligned-access-mode -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GFX11 %s define <2 x half> @chain_hi_to_lo_private() { ; GFX900-LABEL: chain_hi_to_lo_private: @@ -47,6 +49,18 @@ ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, off, s0 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_private: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, 2 +; GFX11-NEXT: scratch_load_u16 v0, off, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_load_d16_hi_b16 v0, off, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1 %load_lo = load half, half addrspace(5)* %gep_lo @@ -97,6 +111,16 @@ ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, v1, off ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; 
FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_private_different_bases: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_u16 v0, v0, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_load_d16_hi_b16 v0, v1, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half addrspace(5)* %base_lo %load_hi = load half, half addrspace(5)* %base_hi @@ -145,6 +169,16 @@ ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_arithmatic: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %arith_lo = fadd half %in, 1.0 %load_hi = load half, half addrspace(5)* %base @@ -176,6 +210,17 @@ ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_group: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: ds_load_u16 v0, v1 offset:2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_load_u16_d16_hi v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1 %load_lo = load half, half addrspace(3)* %gep_lo @@ -207,6 +252,16 @@ ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_group_different_bases: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_u16 v0, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: ds_load_u16_d16_hi v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half addrspace(3)* %base_lo %load_hi = load half, half addrspace(3)* %base_hi @@ -244,6 +299,20 @@ ; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_global: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_d16_hi_b16 v0, v[1:2], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1 %load_lo = load half, half addrspace(1)* %gep_lo @@ -275,6 +344,16 @@ ; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_global_different_bases: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half addrspace(1)* %base_lo %load_hi = load half, half addrspace(1)* %base_hi @@ -312,6 +391,20 @@ ; GFX10-NEXT: flat_load_short_d16_hi v0, v[1:2] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_flat: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 2 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: flat_load_u16 v0, v[0:1] +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[1:2] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half* null, i64 1 %load_lo = load half, half* %gep_lo @@ -343,6 +436,16 @@ ; GFX10-NEXT: flat_load_short_d16_hi v0, v[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_flat_different_bases: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_load_u16 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half* %base_lo %load_hi = load half, half* %base_hi @@ -483,6 +586,31 @@ ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; FLATSCR_GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: vload2_private: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b16 off, v0, off offset:6 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u16 v0, v2, s[0:1] offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b16 off, v0, off offset:8 dlc 
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 +; GFX11-NEXT: scratch_load_b32 v1, off, off offset:6 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %loc = alloca [3 x i16], align 2, addrspace(5) %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)* @@ -534,6 +662,18 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_group_other_dep: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_u16_d16_hi v1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX11-NEXT: ds_load_u16_d16 v1, v0 offset:2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 %load_lo = load i16, i16 addrspace(3)* %gep_lo @@ -568,6 +708,18 @@ ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_group_other_dep_multi_chain: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_u16 v1, v0 offset:2 +; GFX11-NEXT: ds_load_u16_d16_hi v0, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo @@ -625,6 +777,18 @@ ; FLATSCR_GFX10-NEXT: s_waitcnt 
vmcnt(0) ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_private_other_dep: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] +; GFX11-NEXT: scratch_load_d16_b16 v1, v0, off offset:2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1 %load_lo = load i16, i16 addrspace(5)* %gep_lo @@ -660,6 +824,19 @@ ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_global_other_dep: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u16 v2, v[0:1], off offset:2 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_d16_hi_b16 v0, v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1 %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo @@ -698,6 +875,19 @@ ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_flat_other_dep: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_load_u16 v2, v[0:1] offset:2 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: flat_load_d16_hi_b16 v0, v[0:1] glc dlc +; 
GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1 %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo @@ -734,6 +924,20 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: chain_hi_to_lo_group_may_alias_store: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX11-NEXT: ds_load_u16 v3, v0 +; GFX11-NEXT: ds_store_b16 v1, v2 +; GFX11-NEXT: ds_load_u16 v0, v0 offset:2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0 diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll --- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s ; Test that unused lanes in the s_xor result are masked out with v_cndmask. 
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -4,6 +4,7 @@ ; RUN: llc < %s -march=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10 ; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX11 declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone @@ -83,6 +84,20 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: s_ctlz_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clz_i32_u32 s2, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s2, s2, 32 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 ret void @@ -173,6 +188,21 @@ ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_ctlz_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid %val = load i32, i32 addrspace(1)* %in.gep, align 4 @@ -277,6 +307,24 @@ ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_ctlz_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8 @@ -403,6 +451,29 @@ ; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 ; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_ctlz_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 
v[0:3], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v3, v3 +; GFX11-NEXT: v_clz_i32_u32_e32 v2, v2 +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16 @@ -507,6 +578,24 @@ ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 ; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_ctlz_i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u16 v1, v1, -8 +; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load i8, i8 addrspace(1)* %valptr %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone store i8 %ctlz, i8 addrspace(1)* %out @@ -592,6 +681,23 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; 
GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: s_ctlz_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clz_i32_u32 s2, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_add_nc_u32_e64 v0, s2, 32 clamp +; GFX11-NEXT: s_clz_i32_u32 s2, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_min3_u32 v0, v0, s2, 64 +; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) store i64 %ctlz, i64 addrspace(1)* %out ret void @@ -671,6 +777,23 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: s_ctlz_i64_trunc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clz_i32_u32 s2, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_add_nc_u32_e64 v0, s2, 32 clamp +; GFX11-NEXT: s_clz_i32_u32 s2, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_min3_u32 v0, v0, s2, 64 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) %trunc = trunc i64 %ctlz to i32 store i32 %trunc, i32 addrspace(1)* %out @@ -780,6 +903,24 @@ ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_ctlz_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp +; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -892,6 +1033,24 @@ ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_ctlz_i64_trunc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: v_clz_i32_u32_e32 v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp +; GFX11-NEXT: v_min3_u32 v1, v1, v2, 64 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -988,6 +1147,19 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1: +; 
GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid %val = load i32, i32 addrspace(1)* %in.gep @@ -1084,6 +1256,19 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid %val = load i32, i32 addrspace(1)* %in.gep @@ -1192,6 +1377,23 @@ ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid %val = load i32, i32 addrspace(1)* %in.gep @@ -1299,6 +1501,23 @@ ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid %val = load i32, i32 addrspace(1)* %in.gep @@ -1402,6 +1621,20 @@ ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) 
+; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid %val = load i8, i8 addrspace(1)* %valptr.gep @@ -1509,6 +1742,25 @@ ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v2, -16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %valptr %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone %cmp = icmp eq i16 %val, 0 @@ -1616,6 +1868,21 @@ ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: 
v_dual_and_b32 v0, 0x7f, v0 +; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid %val = load i7, i7 addrspace(1)* %valptr.gep diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VI ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx908 -start-before=amdgpu-isel -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone @@ -26,6 +27,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_i32_to_f32_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 %cvt = uitofp i32 %masked to float ret float %cvt @@ -50,6 +58,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_sitofp_i32_to_f32_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 
%cvt = sitofp i32 %masked to float ret float %cvt @@ -77,6 +92,15 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 7, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_to_f32_lshr7_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lshr.7 = lshr i32 %arg0, 7 %masked = and i32 %lshr.7, 255 %cvt = uitofp i32 %masked to float @@ -102,6 +126,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_to_f32_lshr8_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 %masked = and i32 %lshr.8, 255 %cvt = uitofp i32 %masked to float @@ -149,6 +180,16 @@ ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 store i32 %lshr.8, i32 addrspace(1)* undef %masked = and i32 %lshr.8, 255 @@ -175,6 +216,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_to_f32_lshr16_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 16 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to float @@ -200,6 +248,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_to_f32_lshr24_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 24 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to float @@ -225,6 +280,13 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_i8_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i8 %arg0 to float ret float %cvt } @@ -254,6 +316,16 @@ ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_v2i8_to_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = bitcast i16 %arg0 to <2 x i8> %cvt = uitofp <2 x i8> %val to <2 x float> ret <2 x float> %cvt @@ -287,6 +359,17 @@ ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_v3i8_to_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v0 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %trunc = trunc i32 %arg0 to i24 %val = bitcast i24 %trunc to <3 x i8> %cvt = uitofp <3 x i8> %val to <3 x float> @@ -324,6 +407,18 @@ ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_v4i8_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = bitcast i32 %arg0 to <4 x i8> %cvt = uitofp <4 x i8> %val to <4 x float> ret <4 x float> %cvt @@ -360,6 +455,18 @@ ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_unpack_i32_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_mov_b32_e32 v0, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] %mask.arg0 = and i32 %arg0, 255 %cvt0 = uitofp i32 %mask.arg0 to float @@ -412,6 +519,15 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_i32_to_f16_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 %cvt = uitofp i32 %masked to half ret half %cvt @@ -447,6 +563,15 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_sitofp_i32_to_f16_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 %cvt = sitofp i32 %masked to half ret half %cvt @@ -482,6 +607,15 @@ ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_to_f16_lshr8_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 %masked = and i32 %lshr.8, 255 %cvt = uitofp i32 %masked to half @@ -518,6 +652,15 @@ ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_to_f16_lshr16_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 16 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to half @@ -554,6 +697,15 @@ ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 ; 
GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_to_f16_lshr24_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 24 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to half @@ -587,6 +739,15 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_i8_to_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i8 %arg0 to half ret half %cvt } @@ -613,6 +774,15 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_i32_to_f64_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 %cvt = uitofp i32 %masked to double ret double %cvt @@ -640,6 +810,15 @@ ; GFX9-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_to_f64_lshr8_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_bfe_u32 v0, v0, 8, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 %masked = and i32 %lshr.8, 255 %cvt = uitofp i32 %masked to double @@ -668,6 +847,15 @@ ; GFX9-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_to_f64_lshr16_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 16 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to double @@ -696,6 +884,15 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_to_f64_lshr24_mask255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 24 %masked = and i32 %lshr.16, 255 %cvt = uitofp i32 %masked to double @@ -734,6 +931,16 @@ ; GFX9-NEXT: v_and_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_uitofp_i8_to_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %cvt = uitofp i8 %arg0 to double ret double %cvt } @@ -796,6 +1003,20 @@ ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: load_i8_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid %load = load i8, i8 addrspace(1)* %gep, align 1 @@ -870,6 +1091,21 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: load_v2i8_to_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2 @@ -949,6 +1185,21 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: load_v3i8_to_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4 @@ -1031,6 +1282,23 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: load_v4i8_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4 @@ -1148,6 +1416,29 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: load_v4i8_to_v4f32_unaligned: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 +; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2 +; 
GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:1 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: global_store_b128 v5, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 @@ -1301,6 +1592,49 @@ ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: global_store_dword v5, v0, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: load_v4i8_to_v4f32_2_uses: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_add_nc_u16 v2, v0, 9 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff00, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u16 v1, v1, 9 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u16 v2, v2, 0x900 +; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX11-NEXT: v_add_nc_u16 v1, v1, 0x900 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v2 +; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3] +; GFX11-NEXT: global_store_b32 v4, v5, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4 @@ -1459,6 +1793,38 @@ ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: load_v7i8_to_v7f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_lshlrev_b32 v0, 3, v0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:6 +; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 +; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2 +; GFX11-NEXT: global_load_u8 v6, v0, s[2:3] offset:1 +; GFX11-NEXT: global_load_d16_b16 v4, v0, s[2:3] offset:4 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: 
v_cvt_f32_ubyte0_e32 v1, v6 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v5 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b96 v7, v[4:6], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1 @@ -1561,6 +1927,29 @@ ; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[0:1] offset:16 ; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: load_v8i8_to_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v10, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[8:9], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v7, v9 +; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v6, v9 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v5, v9 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 +; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v8 +; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v8 +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v8 +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v10, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v10, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8 @@ -1635,6 +2024,21 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, 
v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: i8_zext_inreg_i32_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %load = load i32, i32 addrspace(1)* %gep, align 4 @@ -1707,6 +2111,19 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: i8_zext_inreg_hi1_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %load = load i32, i32 addrspace(1)* %gep, align 4 @@ -1777,6 +2194,20 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: i8_zext_i32_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid %load = load i8, i8 addrspace(1)* %gep, align 1 @@ -1891,6 +2322,29 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 +; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2 +; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:1 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: global_store_b128 v5, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1 @@ -1962,6 +2416,19 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: extract_byte0_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 
v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep @@ -2033,6 +2500,19 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: extract_byte1_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep @@ -2105,6 +2585,19 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: extract_byte2_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep @@ -2177,6 +2670,19 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, 
v0, s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: extract_byte3_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %val = load i32, i32 addrspace(1)* %gep @@ -2256,6 +2762,22 @@ ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-NEXT: global_store_dword v1, v0, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: cvt_ubyte0_or_multiuse: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_or_b32_e32 v0, 0x80000001, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %lid @@ -2379,6 +2901,33 @@ ; GFX9-NEXT: global_store_byte v[0:1], v6, off ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_branch .LBB40_1 +; +; GFX11-LABEL: cvt_f32_ubyte0_vector: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_load_u8 v1, v0, s[0:1] offset:3 +; 
GFX11-NEXT: global_load_u8 v2, v0, s[0:1] offset:2 +; GFX11-NEXT: global_load_u8 v3, v0, s[0:1] offset:1 +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v1, s0, v1, 0.5 +; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-NEXT: global_store_b8 v[0:1], v3, off +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: global_store_b8 v[0:1], v1, off +; GFX11-NEXT: .LBB40_1: ; %for.body.i +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_branch .LBB40_1 entry: br label %for.body.i diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s declare i32 @llvm.amdgcn.workitem.id.x() #0 @@ -32,6 +33,14 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:12 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: write_ds_sub0_offset0_global: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 
v0, 0, v0 +; GFX11-NEXT: ds_store_b32 v0, v1 offset:12 +; GFX11-NEXT: s_endpgm entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1 %sub1 = sub i32 0, %x.i @@ -94,6 +103,23 @@ ; GFX10-NEXT: global_store_dword v[0:1], v4, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: write_ds_sub0_offset0_global_clamp_bit: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: ds_store_b32 v2, v3 offset:12 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_div_fmas_f32 v4, s0, s0, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1 %sub1 = sub i32 0, %x.i @@ -158,6 +184,23 @@ ; GFX10-NEXT: global_store_dword v[0:1], v4, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: write_ds_sub_max_offset_global_clamp_bit: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: v_not_b32_e32 v0, v0 +; GFX11-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: ds_store_b32 v2, v3 offset:65532 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_div_fmas_f32 v4, s0, s0, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1 %sub1 = sub i32 -1, %x.i %tmp0 = getelementptr [256 x i32], [256 x i32] 
addrspace(3)* @lds.obj, i32 0, i32 %sub1 @@ -190,6 +233,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add_x_shl_max_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 4, v0 +; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535 +; GFX11-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() %shl = shl i32 %x.i, 4 %add = add i32 %shl, 65535 @@ -224,6 +273,14 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535 +; GFX11-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() %.neg = mul i32 %x.i, -4 %add = add i32 %.neg, 65535 @@ -258,6 +315,14 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0 ; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX11-NEXT: ds_store_b8 v0, v1 offset:65535 +; GFX11-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -292,6 +357,14 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0 ; GFX10-NEXT: ds_write_b8 v0, v1 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0 +; GFX11-NEXT: ds_store_b8 v0, v1 +; GFX11-NEXT: s_endpgm %x.i = call i32 
@llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -329,6 +402,15 @@ ; GFX10-NEXT: ds_write_b32 v0, v1 offset:123 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:456 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 +; GFX11-NEXT: ds_store_b32 v0, v1 offset:456 +; GFX11-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -369,6 +451,15 @@ ; GFX10-NEXT: ds_write_b32 v0, v1 offset:123 ; GFX10-NEXT: ds_write_b32 v0, v1 offset:123 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 +; GFX11-NEXT: ds_store_b32 v0, v1 offset:123 +; GFX11-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -408,6 +499,15 @@ ; GFX10-NEXT: ds_write_b32 v0, v1 offset:1023 ; GFX10-NEXT: ds_write_b32 v0, v2 offset:1019 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fb, v0 +; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1 +; GFX11-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -474,6 +574,24 @@ ; GFX10-NEXT: global_store_dword v[0:1], v5, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: 
add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x3fb, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: ds_store_2addr_b32 v2, v3, v4 offset1:1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_div_fmas_f32 v5, s0, s0, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -514,6 +632,15 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: ds_write2_b32 v0, v2, v1 offset0:127 offset1:128 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x3fc, v0 +; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1 +; GFX11-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll @@ -66,6 +66,10 @@ ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1034 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1034 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1035 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1035 %s ; RUN: llc -filetype=obj 
-march=amdgcn -mcpu=gfx1036 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1036 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1100 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1100 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1101 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1101 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1102 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1102 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1103 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1103 %s ; FIXME: With the default attributes the eflags are not accurate for ; xnack and sramecc. Subsequent Target-ID patches will address this. @@ -129,6 +133,10 @@ ; GFX1034: EF_AMDGPU_MACH_AMDGCN_GFX1034 (0x3E) ; GFX1035: EF_AMDGPU_MACH_AMDGCN_GFX1035 (0x3D) ; GFX1036: EF_AMDGPU_MACH_AMDGCN_GFX1036 (0x45) +; GFX1100: EF_AMDGPU_MACH_AMDGCN_GFX1100 (0x41) +; GFX1101: EF_AMDGPU_MACH_AMDGCN_GFX1101 (0x46) +; GFX1102: EF_AMDGPU_MACH_AMDGCN_GFX1102 (0x47) +; GFX1103: EF_AMDGPU_MACH_AMDGCN_GFX1103 (0x44) ; ALL: ] define amdgpu_kernel void @elf_header() { diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx704 < %s | FileCheck -check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s define i32 @s_add_co_select_user() { ; GFX7-LABEL: s_add_co_select_user: @@ -55,6 +56,25 @@ ; 
GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_add_co_select_user: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_cmpk_lg_u32 s1, 0x0 +; GFX11-NEXT: s_addc_u32 s1, s0, 0 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_gt_u32 s0, 31 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s2 +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %i = load volatile i32, i32 addrspace(4)* null, align 8 %i1 = add i32 %i, %i @@ -148,6 +168,33 @@ ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_add_co_br_user: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_i32 s1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lt_u32 s1, s0 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX11-NEXT: s_cmpk_lg_u32 s1, 0x0 +; GFX11-NEXT: s_addc_u32 s0, s0, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, s0, v0 +; GFX11-NEXT: s_cbranch_vccnz .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %bb0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 9 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: .LBB1_2: ; %bb1 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 10 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: %i1 = add i32 %i, %i %i2 = icmp ult i32 %i1, %i diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX7-UNALIGNED %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-access-mode < %s | FileCheck --check-prefix=GFX11 %s ; Should not merge this to a dword load define i32 @global_load_2xi16_align2(i16 addrspace(1)* %p) #0 { @@ -51,6 +52,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_load_2xi16_align2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_u16 v2, v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off offset:2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 2 %p.1 = load i16, i16 addrspace(1)* %gep.p, align 2 @@ -116,6 +128,18 @@ ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: global_store_short v0, v2, s[0:1] offset:2 ; GFX10-NEXT: s_endpgm +; 
+; GFX11-LABEL: global_store_2xi16_align2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1 +; GFX11-NEXT: v_mov_b32_e32 v2, 2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] offset:2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1 store i16 1, i16 addrspace(1)* %r, align 2 store i16 2, i16 addrspace(1)* %gep.r, align 2 @@ -176,6 +200,17 @@ ; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_load_2xi16_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 1 %p.1 = load i16, i16 addrspace(1)* %gep.p, align 1 @@ -242,6 +277,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_2xi16_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1 store i16 1, i16 addrspace(1)* %r, align 1 store i16 2, i16 addrspace(1)* %gep.r, align 1 @@ -291,6 +335,17 @@ ; GFX10-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 ; GFX10-NEXT: 
v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_load_2xi16_align4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1 %p.0 = load i16, i16 addrspace(1)* %p, align 4 %p.1 = load i16, i16 addrspace(1)* %gep.p, align 2 @@ -350,6 +405,15 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_2xi16_align4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1 store i16 1, i16 addrspace(1)* %r, align 4 store i16 2, i16 addrspace(1)* %gep.r, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -5,6 +5,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX9-FLASTSCR %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX10-FLASTSCR %s +; RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX11-FLASTSCR %s ; Should not merge this to a dword load define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 { @@ -71,6 +73,28 @@ ; GFX10-FLASTSCR-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLASTSCR-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: private_load_2xi16_align2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_u16 v1, v0, off +; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLASTSCR-LABEL: private_load_2xi16_align2: +; GFX11-FLASTSCR: ; %bb.0: +; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLASTSCR-NEXT: s_clause 0x1 +; GFX11-FLASTSCR-NEXT: scratch_load_u16 v1, v0, off +; GFX11-FLASTSCR-NEXT: scratch_load_u16 v0, v0, off offset:2 +; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLASTSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 2 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2 @@ -146,6 +170,30 @@ ; GFX10-FLASTSCR-NEXT: scratch_store_short v1, v2, off offset:2 ; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: private_store_2xi16_align2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 1 +; GFX11-NEXT: v_mov_b32_e32 v2, 2 +; 
GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b16 v1, v0, off +; GFX11-NEXT: scratch_store_b16 v1, v2, off offset:2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLASTSCR-LABEL: private_store_2xi16_align2: +; GFX11-FLASTSCR: ; %bb.0: +; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLASTSCR-NEXT: v_mov_b32_e32 v0, 1 +; GFX11-FLASTSCR-NEXT: v_mov_b32_e32 v2, 2 +; GFX11-FLASTSCR-NEXT: s_clause 0x1 +; GFX11-FLASTSCR-NEXT: scratch_store_b16 v1, v0, off +; GFX11-FLASTSCR-NEXT: scratch_store_b16 v1, v2, off offset:2 +; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 store i16 1, i16 addrspace(5)* %r, align 2 store i16 2, i16 addrspace(5)* %gep.r, align 2 @@ -224,6 +272,28 @@ ; GFX10-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 ; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: private_load_2xi16_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v0, v0, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLASTSCR-LABEL: private_load_2xi16_align1: +; GFX11-FLASTSCR: ; %bb.0: +; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off +; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-FLASTSCR-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 
addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 1 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 1 @@ -293,6 +363,24 @@ ; GFX10-FLASTSCR-NEXT: scratch_store_dword v1, v0, off ; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: private_store_2xi16_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX11-NEXT: scratch_store_b32 v1, v0, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLASTSCR-LABEL: private_store_2xi16_align1: +; GFX11-FLASTSCR: ; %bb.0: +; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX11-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off +; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 store i16 1, i16 addrspace(5)* %r, align 1 store i16 2, i16 addrspace(5)* %gep.r, align 1 @@ -363,6 +451,28 @@ ; GFX10-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 ; GFX10-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: private_load_2xi16_align4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v0, v0, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLASTSCR-LABEL: private_load_2xi16_align4: +; GFX11-FLASTSCR: ; %bb.0: +; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLASTSCR-NEXT: scratch_load_b32 v0, v0, off +; 
GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLASTSCR-NEXT: v_bfi_b32 v1, 0xffff, 0, v0 +; GFX11-FLASTSCR-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLASTSCR-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1 %p.0 = load i16, i16 addrspace(5)* %p, align 4 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2 @@ -434,6 +544,24 @@ ; GFX10-FLASTSCR-NEXT: scratch_store_dword v1, v0, off ; GFX10-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FLASTSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: private_store_2xi16_align4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX11-NEXT: scratch_store_b32 v1, v0, off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FLASTSCR-LABEL: private_store_2xi16_align4: +; GFX11-FLASTSCR: ; %bb.0: +; GFX11-FLASTSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLASTSCR-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX11-FLASTSCR-NEXT: scratch_store_b32 v1, v0, off +; GFX11-FLASTSCR-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FLASTSCR-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 store i16 1, i16 addrspace(5)* %r, align 4 store i16 2, i16 addrspace(5)* %gep.r, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll --- a/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll +++ b/llvm/test/CodeGen/AMDGPU/fmad-formation-fmul-distribute-denormal-mode.ll @@ -5,6 +5,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -denormal-fp-math-f32=ieee < %s | FileCheck --check-prefix=NOFUSE %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck --check-prefix=FMA %s ; RUN: llc 
-march=amdgcn -mcpu=gfx1010 -denormal-fp-math-f32=ieee < %s | FileCheck --check-prefix=FMAGFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math-f32=ieee < %s | FileCheck --check-prefix=FMAGFX11 %s ; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefix=FMAD %s ; RUN: llc -march=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefix=FMAD %s @@ -12,6 +13,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefix=FMAD %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefix=FMAD %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefix=FMADGFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefix=FMAGFX11 %s ; Check for incorrect fmad formation when distributing @@ -36,6 +38,13 @@ ; FMAGFX10-NEXT: v_fmac_f32_e32 v0, v1, v0 ; FMAGFX10-NEXT: s_setpc_b64 s[30:31] ; +; FMAGFX11-LABEL: unsafe_fmul_fadd_distribute_fast_f32: +; FMAGFX11: ; %bb.0: +; FMAGFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMAGFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; FMAGFX11-NEXT: v_fmac_f32_e32 v0, v1, v0 +; FMAGFX11-NEXT: s_setpc_b64 s[30:31] +; ; FMAD-LABEL: unsafe_fmul_fadd_distribute_fast_f32: ; FMAD: ; %bb.0: ; FMAD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -74,6 +83,13 @@ ; FMAGFX10-NEXT: v_fma_f32 v0, -v1, v0, v0 ; FMAGFX10-NEXT: s_setpc_b64 s[30:31] ; +; FMAGFX11-LABEL: unsafe_fmul_fsub_distribute_fast_f32: +; FMAGFX11: ; %bb.0: +; FMAGFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMAGFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; FMAGFX11-NEXT: v_fma_f32 v0, -v1, v0, v0 +; FMAGFX11-NEXT: s_setpc_b64 s[30:31] +; ; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_f32: ; FMAD: ; %bb.0: ; FMAD-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) @@ -116,6 +132,13 @@ ; FMAGFX10-NEXT: v_fmac_f32_e32 v1, v3, v1 ; FMAGFX10-NEXT: s_setpc_b64 s[30:31] ; +; FMAGFX11-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32: +; FMAGFX11: ; %bb.0: +; FMAGFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMAGFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; FMAGFX11-NEXT: v_dual_fmac_f32 v0, v2, v0 :: v_dual_fmac_f32 v1, v3, v1 +; FMAGFX11-NEXT: s_setpc_b64 s[30:31] +; ; FMAD-LABEL: unsafe_fmul_fadd_distribute_fast_v2f32: ; FMAD: ; %bb.0: ; FMAD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -160,6 +183,14 @@ ; FMAGFX10-NEXT: v_fma_f32 v1, -v3, v1, v1 ; FMAGFX10-NEXT: s_setpc_b64 s[30:31] ; +; FMAGFX11-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32: +; FMAGFX11: ; %bb.0: +; FMAGFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMAGFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; FMAGFX11-NEXT: v_fma_f32 v0, -v2, v0, v0 +; FMAGFX11-NEXT: v_fma_f32 v1, -v3, v1, v1 +; FMAGFX11-NEXT: s_setpc_b64 s[30:31] +; ; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32: ; FMAD: ; %bb.0: ; FMAD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -200,6 +231,13 @@ ; FMAGFX10-NEXT: v_fma_f32 v0, v0, v1, v1 ; FMAGFX10-NEXT: s_setpc_b64 s[30:31] ; +; FMAGFX11-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32: +; FMAGFX11: ; %bb.0: +; FMAGFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMAGFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; FMAGFX11-NEXT: v_fma_f32 v0, v0, v1, v1 +; FMAGFX11-NEXT: s_setpc_b64 s[30:31] +; ; FMAD-LABEL: unsafe_fast_fmul_fadd_distribute_post_legalize_f32: ; FMAD: ; %bb.0: ; FMAD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -239,6 +277,13 @@ ; FMAGFX10-NEXT: v_fma_f32 v0, -v0, v1, v1 ; FMAGFX10-NEXT: s_setpc_b64 s[30:31] ; +; FMAGFX11-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize: +; FMAGFX11: ; %bb.0: +; FMAGFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FMAGFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; FMAGFX11-NEXT: v_fma_f32 v0, -v0, v1, v1 +; FMAGFX11-NEXT: s_setpc_b64 s[30:31] +; ; 
FMAD-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize: ; FMAD: ; %bb.0: ; FMAD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll b/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fold-legalize-dag-increase-insts.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-no-signed-zeros-fp-math=false < %s | FileCheck %s --check-prefix=GFX9 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-no-signed-zeros-fp-math=true < %s | FileCheck %s --check-prefix=GFX10 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -enable-no-signed-zeros-fp-math=false < %s | FileCheck %s --check-prefix=GFX10 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -enable-no-signed-zeros-fp-math=true < %s | FileCheck %s --check-prefix=GFX10 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -enable-no-signed-zeros-fp-math=false < %s | FileCheck %s --check-prefix=GFX10 ; no-signed-zeros-fp-math should not increase the number of ; instructions emitted. 
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll @@ -3,11 +3,13 @@ ; RUN: llc < %s -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=GFX7 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GFX10 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1030 +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1100 ; RUN: llc < %s -global-isel -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=G_SI ; RUN: llc < %s -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX7 ; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10 ; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030 +; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1100 declare float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float, <4 x i32>, i32, i32, i32 immarg) declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32, i32 immarg) @@ -56,6 +58,17 @@ ; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; GFX1030-NEXT: s_endpgm ; +; GFX1100-LABEL: raw_buffer_atomic_min_noret_f32: +; GFX1100: ; %bb.0: ; %main_body +; GFX1100-NEXT: s_clause 0x1 +; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen +; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-NEXT: s_endpgm 
+; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd @@ -97,6 +110,17 @@ ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 ; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen ; G_GFX1030-NEXT: s_endpgm +; +; G_GFX1100-LABEL: raw_buffer_atomic_min_noret_f32: +; G_GFX1100: ; %bb.0: ; %main_body +; G_GFX1100-NEXT: s_clause 0x1 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen +; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -135,6 +159,14 @@ ; GFX1030-NEXT: global_store_dword v[0:1], v0, off ; GFX1030-NEXT: s_endpgm ; +; GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32: +; GFX1100: ; %bb.0: ; %main_body +; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc +; GFX1100-NEXT: s_waitcnt vmcnt(0) +; GFX1100-NEXT: global_store_b32 v[0:1], v0, off +; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc @@ -166,6 +198,14 @@ ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: global_store_dword v[0:1], v0, off ; G_GFX1030-NEXT: s_endpgm +; +; G_GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32: +; G_GFX1100: ; %bb.0: ; %main_body +; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc +; G_GFX1100-NEXT: s_waitcnt vmcnt(0) +; G_GFX1100-NEXT: global_store_b32 v[0:1], v0, off +; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, 
<4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) store float %ret, float addrspace(1)* undef @@ -234,6 +274,20 @@ ; GFX1030-NEXT: ds_write_b32 v1, v0 ; GFX1030-NEXT: s_endpgm ; +; GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: +; GFX1100: ; %bb.0: ; %main_body +; GFX1100-NEXT: s_clause 0x2 +; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c +; GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc +; GFX1100-NEXT: v_mov_b32_e32 v1, s0 +; GFX1100-NEXT: s_waitcnt vmcnt(0) +; GFX1100-NEXT: ds_store_b32 v1, v0 +; GFX1100-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd @@ -294,6 +348,20 @@ ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: ds_write_b32 v1, v0 ; G_GFX1030-NEXT: s_endpgm +; +; G_GFX1100-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: +; G_GFX1100: ; %bb.0: ; %main_body +; G_GFX1100-NEXT: s_clause 0x2 +; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b32 s0, s[0:1], 0x3c +; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 4 offen glc slc +; G_GFX1100-NEXT: v_mov_b32_e32 v1, s0 +; G_GFX1100-NEXT: s_waitcnt vmcnt(0) +; G_GFX1100-NEXT: ds_store_b32 v1, v0 +; G_GFX1100-NEXT: s_endpgm ; GFX1010-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: main_body: %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) @@ -344,6 +412,17 @@ ; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; GFX1030-NEXT: s_endpgm ; +; GFX1100-LABEL: raw_buffer_atomic_max_noret_f32: +; GFX1100: ; %bb.0: ; %main_body +; 
GFX1100-NEXT: s_clause 0x1 +; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen +; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd @@ -385,6 +464,17 @@ ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 ; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen ; G_GFX1030-NEXT: s_endpgm +; +; G_GFX1100-LABEL: raw_buffer_atomic_max_noret_f32: +; G_GFX1100: ; %bb.0: ; %main_body +; G_GFX1100-NEXT: s_clause 0x1 +; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen +; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -423,6 +513,14 @@ ; GFX1030-NEXT: global_store_dword v[0:1], v0, off ; GFX1030-NEXT: s_endpgm ; +; GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32: +; GFX1100: ; %bb.0: ; %main_body +; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc +; GFX1100-NEXT: s_waitcnt vmcnt(0) +; GFX1100-NEXT: global_store_b32 v[0:1], v0, off +; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc @@ -454,6 +552,14 @@ ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: global_store_dword v[0:1], v0, off ; G_GFX1030-NEXT: s_endpgm +; +; G_GFX1100-LABEL: 
raw_buffer_atomic_max_rtn_f32: +; G_GFX1100: ; %bb.0: ; %main_body +; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc +; G_GFX1100-NEXT: s_waitcnt vmcnt(0) +; G_GFX1100-NEXT: global_store_b32 v[0:1], v0, off +; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) store float %ret, float addrspace(1)* undef @@ -521,6 +627,21 @@ ; GFX1030-NEXT: global_store_dword v1, v0, s[0:1] ; GFX1030-NEXT: s_endpgm ; +; GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: +; GFX1100: ; %bb.0: ; %main_body +; GFX1100-NEXT: s_clause 0x2 +; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c +; GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc +; GFX1100-NEXT: v_mov_b32_e32 v1, 0 +; GFX1100-NEXT: s_waitcnt vmcnt(0) +; GFX1100-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1100-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd @@ -580,6 +701,21 @@ ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: global_store_dword v1, v0, s[0:1] ; G_GFX1030-NEXT: s_endpgm +; +; G_GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: +; G_GFX1100: ; %bb.0: ; %main_body +; G_GFX1100-NEXT: s_clause 0x2 +; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c +; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc +; G_GFX1100-NEXT: v_mov_b32_e32 v1, 0 
+; G_GFX1100-NEXT: s_waitcnt vmcnt(0) +; G_GFX1100-NEXT: global_store_b32 v1, v0, s[0:1] +; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; G_GFX1100-NEXT: s_endpgm main_body: %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) store float %ret, float addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/fpow.ll @@ -4,6 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s define float @v_pow_f32(float %x, float %y) { ; GFX6-LABEL: v_pow_f32: @@ -46,6 +47,17 @@ ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } @@ -106,6 +118,19 @@ ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: v_exp_f32_e32 v1, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: v_log_f32_e32 v1, v1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v0, v2, v0 :: v_dual_mul_dx9_zero_f32 v1, v3, v1 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: v_exp_f32_e32 v1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x float> @llvm.pow.v2f32(<2 x float> %x, <2 x float> %y) ret <2 x float> %pow } @@ -167,6 +192,21 @@ ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call half @llvm.pow.f16(half %x, half %y) ret half %pow } @@ -263,6 +303,33 @@ ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-NEXT: v_log_f32_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: v_exp_f32_e32 v1, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y) ret <2 x half> %pow } @@ -363,6 +430,33 @@ ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_v2f16_fneg_lhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-NEXT: v_log_f32_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: v_exp_f32_e32 v1, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y) ret <2 x half> %pow @@ -464,6 +558,33 @@ ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-NEXT: v_pack_b32_f16 
v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_v2f16_fneg_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_f16_e64 v3, -v3 +; GFX11-NEXT: v_log_f32_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: v_exp_f32_e32 v1, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg) ret <2 x half> %pow @@ -569,6 +690,33 @@ ; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_v2f16_fneg_lhs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f32_f16_e64 v2, -v2 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_f16_e64 v3, -v3 +; GFX11-NEXT: v_log_f32_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: v_exp_f32_e32 v1, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y.fneg) @@ -622,6 +770,17 @@ ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_f32_fabs_lhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e64 v0, |v0| +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %pow = call float @llvm.pow.f32(float %fabs.x, float %y) ret float %pow @@ -668,6 +827,17 @@ ; GFX10-NEXT: v_mul_legacy_f32_e64 v0, |v1|, v0 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_f32_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, |v1|, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: 
s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %pow = call float @llvm.pow.f32(float %x, float %fabs.y) ret float %pow @@ -714,6 +884,17 @@ ; GFX10-NEXT: v_mul_legacy_f32_e64 v0, |v1|, v0 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_pow_f32_fabs_lhs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_log_f32_e64 v0, |v0| +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e64 v0, |v1|, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %fabs.y = call float @llvm.fabs.f32(float %y) %pow = call float @llvm.pow.f32(float %fabs.x, float %fabs.y) @@ -755,6 +936,15 @@ ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_pow_f32_sgpr_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_log_f32_e32 v1, s0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } @@ -794,6 +984,15 @@ ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s0, v0 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_pow_f32_vgpr_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_log_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } @@ -833,6 +1032,15 @@ ; GFX10-NEXT: v_mul_legacy_f32_e32 v0, s1, v0 ; GFX10-NEXT: v_exp_f32_e32 v0, v0 ; 
GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: v_pow_f32_sgpr_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_log_f32_e32 v0, s0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, s1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_exp_f32_e32 v0, v0 +; GFX11-NEXT: ; return to shader part epilog %pow = call float @llvm.pow.f32(float %x, float %y) ret float %pow } diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -4,6 +4,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)* %in1, ; SI-LABEL: frem_f16: @@ -159,6 +160,34 @@ ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] +; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX11-NEXT: v_trunc_f16_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm half addrspace(1)* %in2) #0 { %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 %r0 = load half, half addrspace(1)* %in1, align 4 @@ -283,6 +312,27 @@ ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fast_frem_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] +; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_rcp_f16_e32 v3, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f16_e32 v3, v3 +; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm half addrspace(1)* %in2) #0 { %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 %r0 = load half, half addrspace(1)* %in1, align 4 @@ -407,6 +457,27 @@ ; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: unsafe_frem_f16: 
+; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] +; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_rcp_f16_e32 v3, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f16_e32 v3, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f16_e32 v3, v3 +; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm half addrspace(1)* %in2) #1 { %gep2 = getelementptr half, half addrspace(1)* %in2, i32 4 %r0 = load half, half addrspace(1)* %in1, align 4 @@ -578,6 +649,42 @@ ; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v4, null, v2, v2, v1 +; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v5, v4 +; GFX11-NEXT: s_denorm_mode 15 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v6, v3, v5 +; GFX11-NEXT: v_fma_f32 v7, -v4, 
v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v5 +; GFX11-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX11-NEXT: v_div_fixup_f32 v3, v3, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f32_e32 v3, v3 +; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm float addrspace(1)* %in2) #0 { %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 %r0 = load float, float addrspace(1)* %in1, align 4 @@ -694,6 +801,27 @@ ; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fast_frem_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f32_e32 v3, v3 +; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm float addrspace(1)* %in2) #0 { %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 %r0 = load float, float addrspace(1)* %in1, align 4 @@ -810,6 +938,27 @@ ; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 ; GFX10-NEXT: global_store_dword 
v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: unsafe_frem_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f32_e32 v3, v3 +; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm float addrspace(1)* %in2) #1 { %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 %r0 = load float, float addrspace(1)* %in1, align 4 @@ -991,6 +1140,40 @@ ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v12, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v12, s[6:7] +; GFX11-NEXT: global_load_b64 v[2:3], v12, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX11-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX11-NEXT: global_store_b64 v12, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm double addrspace(1)* %in2) #0 { %r0 = load double, double addrspace(1)* %in1, align 8 %r1 = load double, double addrspace(1)* %in2, align 8 @@ -1149,6 +1332,36 @@ ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fast_frem_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v10, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7] +; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX11-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] +; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX11-NEXT: global_store_b64 v10, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm double addrspace(1)* %in2) #0 { %r0 = load double, double addrspace(1)* %in1, align 8 %r1 = load double, double addrspace(1)* %in2, align 8 @@ -1307,6 +1520,36 @@ ; GFX10-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: unsafe_frem_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v10, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7] +; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX11-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; GFX11-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1] +; GFX11-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] +; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] +; GFX11-NEXT: global_store_b64 v10, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm double addrspace(1)* %in2) #1 { %r0 = load double, double addrspace(1)* %in1, align 8 %r1 = load double, double addrspace(1)* %in2, align 8 @@ -1547,6 +1790,50 @@ ; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1 +; GFX11-NEXT: v_trunc_f16_e32 v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_f16_e32 
v4, v1 +; GFX11-NEXT: v_rcp_f32_e32 v5, v5 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v4, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_div_fixup_f16 v4, v4, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f16_e32 v4, v4 +; GFX11-NEXT: v_fma_f16 v1, -v4, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <2 x half> addrspace(1)* %in2) #0 { %gep2 = getelementptr <2 x half>, <2 x half> addrspace(1)* %in2, i32 4 %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1, align 8 @@ -1926,6 +2213,77 @@ ; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v6, v6 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fixup_f16 v5, v5, v3, v1 +; GFX11-NEXT: v_trunc_f16_e32 v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f16 v5, -v5, v3, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GFX11-NEXT: v_rcp_f32_e32 v7, v7 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v6, v6, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX11-NEXT: v_div_fixup_f16 v6, v6, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f16_e32 v6, v6 +; GFX11-NEXT: v_fma_f16 v1, -v6, v3, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pack_b32_f16 v1, v5, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX11-NEXT: v_rcp_f32_e32 v5, v5 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f16_e32 v3, v3 +; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v6, v6 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fixup_f16 v5, v5, v2, v0 +; GFX11-NEXT: v_trunc_f16_e32 v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f16 v0, -v5, v2, v0 +; GFX11-NEXT: v_pack_b32_f16 v0, v3, v0 +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <4 x half> addrspace(1)* %in2) #0 { %gep2 = getelementptr <4 x half>, <4 x half> addrspace(1)* %in2, i32 4 %r0 = load <4 x half>, <4 x half> addrspace(1)* %in1, align 16 @@ -2172,6 +2530,63 @@ ; GFX10-NEXT: v_fma_f32 v0, -v3, v2, v0 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 +; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v7, v6 +; GFX11-NEXT: s_denorm_mode 15 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f32 v8, -v6, v7, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v8, v5, v7 +; GFX11-NEXT: v_fma_f32 v9, -v6, v8, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v8, v9, v7 +; GFX11-NEXT: v_fma_f32 v5, -v6, v8, v5 +; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f32 v5, v5, 
v7, v8 +; GFX11-NEXT: v_div_fixup_f32 v5, v5, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f32_e32 v5, v5 +; GFX11-NEXT: v_fma_f32 v1, -v5, v3, v1 +; GFX11-NEXT: v_div_scale_f32 v5, null, v2, v2, v0 +; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v6, v5 +; GFX11-NEXT: s_denorm_mode 15 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v7, v3, v6 +; GFX11-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v6 +; GFX11-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX11-NEXT: v_div_fixup_f32 v3, v3, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f32_e32 v3, v3 +; GFX11-NEXT: v_fma_f32 v0, -v3, v2, v0 +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <2 x float> addrspace(1)* %in2) #0 { %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8 @@ -2568,6 +2983,105 @@ ; GFX10-NEXT: v_fma_f32 v0, -v5, v4, v0 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] +; GFX11-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f32 v10, null, v7, v7, v3 +; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v11, v10 +; GFX11-NEXT: s_denorm_mode 15 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f32 v12, -v10, v11, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v12, v9, v11 +; GFX11-NEXT: v_fma_f32 v13, -v10, v12, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v12, v13, v11 +; GFX11-NEXT: v_fma_f32 v9, -v10, v12, v9 +; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f32 v9, v9, v11, v12 +; GFX11-NEXT: v_div_fixup_f32 v9, v9, v7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f32_e32 v9, v9 +; GFX11-NEXT: v_fma_f32 v3, -v9, v7, v3 +; GFX11-NEXT: v_div_scale_f32 v9, null, v6, v6, v2 +; GFX11-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v10, v9 +; GFX11-NEXT: s_denorm_mode 15 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f32 v11, -v9, v10, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v11, v7, v10 +; GFX11-NEXT: v_fma_f32 v12, -v9, v11, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v10 +; GFX11-NEXT: v_fma_f32 
v7, -v9, v11, v7 +; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX11-NEXT: v_div_fixup_f32 v7, v7, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f32_e32 v7, v7 +; GFX11-NEXT: v_fma_f32 v2, -v7, v6, v2 +; GFX11-NEXT: v_div_scale_f32 v7, null, v5, v5, v1 +; GFX11-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v9, v7 +; GFX11-NEXT: s_denorm_mode 15 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f32 v10, -v7, v9, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v10, v6, v9 +; GFX11-NEXT: v_fma_f32 v11, -v7, v10, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v9 +; GFX11-NEXT: v_fma_f32 v6, -v7, v10, v6 +; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX11-NEXT: v_div_fixup_f32 v6, v6, v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f32_e32 v6, v6 +; GFX11-NEXT: v_fma_f32 v1, -v6, v5, v1 +; GFX11-NEXT: v_div_scale_f32 v6, null, v4, v4, v0 +; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f32_e32 v7, v6 +; GFX11-NEXT: s_denorm_mode 15 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f32 v9, -v6, v7, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v9, v5, 
v7 +; GFX11-NEXT: v_fma_f32 v10, -v6, v9, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v7 +; GFX11-NEXT: v_fma_f32 v5, -v6, v9, v5 +; GFX11-NEXT: s_denorm_mode 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f32 v5, v5, v7, v9 +; GFX11-NEXT: v_div_fixup_f32 v5, v5, v4, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f32_e32 v5, v5 +; GFX11-NEXT: v_fma_f32 v0, -v5, v4, v0 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <4 x float> addrspace(1)* %in2) #0 { %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16 @@ -2835,6 +3349,59 @@ ; GFX10-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] ; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: frem_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v16, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[0:3], v16, s[6:7] +; GFX11-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: v_fma_f64 
v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX11-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] +; GFX11-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] +; GFX11-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] +; GFX11-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] +; GFX11-NEXT: global_store_b128 v16, v[0:3], s[4:5] +; 
GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <2 x double> addrspace(1)* %in2) #0 { %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4 %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -4,6 +4,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10 +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s --check-prefix=GFX11 declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone @@ -88,6 +89,23 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshl_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, 1 +; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_not_b32 s3, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_alignbit_b32 v0, s2, v0, s3 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z) store i32 %0, i32 addrspace(1)* %in @@ -152,6 +170,18 @@ ; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; 
GFX11-LABEL: fshl_i32_imm: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7) store i32 %0, i32 addrspace(1)* %in @@ -263,6 +293,26 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s3, v3, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshl_v2i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v0, s5, s7, 1 +; GFX11-NEXT: v_alignbit_b32 v3, s4, s6, 1 +; GFX11-NEXT: s_lshr_b32 s5, s5, 1 +; GFX11-NEXT: s_not_b32 s3, s3 +; GFX11-NEXT: s_lshr_b32 s4, s4, 1 +; GFX11-NEXT: s_not_b32 s2, s2 +; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s3 +; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) store <2 x i32> %0, <2 x i32> addrspace(1)* %in @@ -336,6 +386,19 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 25 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshl_v2i32_imm: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 23 +; GFX11-NEXT: 
v_alignbit_b32 v0, s4, s6, 25 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> ) store <2 x i32> %0, <2 x i32> addrspace(1)* %in @@ -499,6 +562,34 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s4, v6, s8 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshl_v4i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v0, s7, s11, 1 +; GFX11-NEXT: v_alignbit_b32 v1, s6, s10, 1 +; GFX11-NEXT: v_alignbit_b32 v5, s5, s9, 1 +; GFX11-NEXT: v_alignbit_b32 v6, s4, s8, 1 +; GFX11-NEXT: s_lshr_b32 s2, s7, 1 +; GFX11-NEXT: s_not_b32 s3, s15 +; GFX11-NEXT: s_lshr_b32 s6, s6, 1 +; GFX11-NEXT: s_not_b32 s7, s14 +; GFX11-NEXT: s_lshr_b32 s5, s5, 1 +; GFX11-NEXT: s_not_b32 s9, s13 +; GFX11-NEXT: s_lshr_b32 s4, s4, 1 +; GFX11-NEXT: s_not_b32 s8, s12 +; GFX11-NEXT: v_alignbit_b32 v3, s2, v0, s3 +; GFX11-NEXT: v_alignbit_b32 v2, s6, v1, s7 +; GFX11-NEXT: v_alignbit_b32 v1, s5, v5, s9 +; GFX11-NEXT: v_alignbit_b32 v0, s4, v6, s8 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) store <4 x i32> %0, <4 x i32> addrspace(1)* %in @@ -590,6 +681,21 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 31 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshl_v4i32_imm: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 31 +; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, 23 +; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 25 +; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 31 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> ) store <4 x i32> %0, <4 x i32> addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -4,6 +4,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=R600 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10 +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11 declare i32 @llvm.fshr.i32(i32, i32, i32) declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) @@ -83,6 +84,20 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s2, s3, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z) store i32 %0, i32 addrspace(1)* %in @@ -147,6 +162,18 @@ ; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 
7 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_i32_imm: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7) store i32 %0, i32 addrspace(1)* %in @@ -232,6 +259,22 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, v2 ; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_v2i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, v0 +; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, v2 +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z) store <2 x i32> %0, <2 x i32> addrspace(1)* %in @@ -305,6 +348,19 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s4, s6, 7 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_v2i32_imm: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 9 +; GFX11-NEXT: 
v_alignbit_b32 v0, s4, s6, 7 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> ) store <2 x i32> %0, <2 x i32> addrspace(1)* %in @@ -416,6 +472,26 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, v5 ; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_v4i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14 +; GFX11-NEXT: v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v5, s12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, v0 +; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, v4 +; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, v5 +; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) store <4 x i32> %0, <4 x i32> addrspace(1)* %in @@ -505,6 +581,21 @@ ; GFX10-NEXT: v_alignbit_b32 v0, s4, s8, 1 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fshr_v4i32_imm: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 1 +; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, 9 +; GFX11-NEXT: v_alignbit_b32 
v1, s5, s9, 7 +; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 1 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> ) store <4 x i32> %0, <4 x i32> addrspace(1)* %in @@ -529,6 +620,13 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2) ret i32 %ret } @@ -553,6 +651,14 @@ ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) ret <2 x i32> %ret } @@ -579,6 +685,15 @@ ; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7 ; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v3, v6 +; GFX11-NEXT: v_alignbit_b32 v1, v1, v4, v7 +; GFX11-NEXT: v_alignbit_b32 v2, v2, v5, v8 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) ret <3 x i32> %ret } @@ -607,6 +722,16 @@ ; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10 ; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11 ; GFX10-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v4, v8 +; GFX11-NEXT: v_alignbit_b32 v1, v1, v5, v9 +; GFX11-NEXT: v_alignbit_b32 v2, v2, v6, v10 +; GFX11-NEXT: v_alignbit_b32 v3, v3, v7, v11 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) ret <4 x i32> %ret } @@ -655,6 +780,18 @@ ; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v0, v3, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2) ret i16 %ret } @@ -722,6 +859,21 @@ ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX11-NEXT: v_and_b32_e32 v2, 0xf000f, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v3, 0xf000f, v3 +; GFX11-NEXT: v_pk_lshrrev_b16 v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_pk_lshlrev_b16 v0, v3, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x 
i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) ret <2 x i16> %ret } @@ -823,6 +975,35 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v3i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v8, -1, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX11-NEXT: v_xor_b32_e32 v10, -1, v6 +; GFX11-NEXT: v_lshlrev_b16 v7, 1, v7 +; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2 +; GFX11-NEXT: v_lshlrev_b16 v0, v8, v0 +; GFX11-NEXT: v_lshrrev_b16 v4, v6, v9 +; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1 +; GFX11-NEXT: v_lshlrev_b16 v6, v10, v7 +; GFX11-NEXT: v_lshrrev_b16 v3, v5, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v1, v2, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) ret <3 x i16> %ret } @@ -955,6 +1136,45 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v9, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX11-NEXT: v_lshlrev_b16 v1, 1, v1 +; GFX11-NEXT: v_xor_b32_e32 v11, -1, v5 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v12, -1, v4 +; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6 +; GFX11-NEXT: v_lshlrev_b16 v8, 1, v8 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX11-NEXT: v_lshlrev_b16 v10, 1, v10 +; GFX11-NEXT: v_xor_b32_e32 v14, -1, v9 +; GFX11-NEXT: v_lshlrev_b16 v1, v11, v1 +; GFX11-NEXT: v_lshlrev_b16 v0, v12, v0 +; GFX11-NEXT: v_lshrrev_b16 v2, v4, v2 +; GFX11-NEXT: v_lshrrev_b16 v3, v5, v3 +; GFX11-NEXT: v_lshlrev_b16 v4, v7, v8 +; GFX11-NEXT: v_lshrrev_b16 v5, v9, v13 +; GFX11-NEXT: v_lshlrev_b16 v7, v14, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v7, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) ret <4 x i16> %ret } @@ -1009,6 +1229,20 @@ ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX11-NEXT: v_not_b32_e32 v5, v4 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 
v5, v[0:1] +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2) ret i64 %ret } @@ -1087,6 +1321,27 @@ ; GFX10-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: v_not_b32_e32 v9, v8 +; GFX11-NEXT: v_not_b32_e32 v11, v10 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] +; GFX11-NEXT: v_lshrrev_b64 v[6:7], v10, v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) ret <2 x i64> %ret } @@ -1152,6 +1407,23 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v2, 8, v2 ; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_i24: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_hi_u32 v3, 0xaaaaaaab, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 
v3, 4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_u32_u24_e32 v3, 24, v3 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 8, v2 +; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2) ret i24 %ret } @@ -1249,6 +1521,34 @@ ; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fshr_v2i24: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffffff, v4 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaaab, v6 +; GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaaab, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 4, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 4, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX11-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; 
GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) ret <2 x i24> %ret } diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10-SCRATCH %s declare hidden amdgpu_gfx void @external_void_func_i1(i1) #0 @@ -150,6 +151,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i1_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_mov_b32_e32 v0, 1 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: scratch_store_b8 off, v0, s32 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: 
v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -245,6 +275,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i1_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: scratch_store_b8 off, v0, s32 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_signext: ; GFX10-SCRATCH: ; 
%bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -343,6 +403,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i1_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: scratch_store_b8 off, v0, s32 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_zeroext: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -435,6 +525,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i8_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: 
s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -525,6 +643,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i8_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: global_load_i8 v0, v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_signext: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -617,6 +764,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i8_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_zeroext: ; 
GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -707,6 +883,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -797,6 +1001,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i16_signext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc +; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_signext: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -889,6 +1122,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i16_zeroext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_zeroext: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -979,6 +1241,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i32_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) @@ -1069,6 +1359,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i64_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1162,6 +1480,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; 
GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1259,6 +1607,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2i64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1358,6 +1735,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; 
GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i64: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1465,6 +1872,37 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v4i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i64: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1561,6 +1999,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_f16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f16_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1649,6 +2115,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_f32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, 
s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f32_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1739,6 +2233,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2f32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; 
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f32_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1832,6 +2354,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3f32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f32_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1930,6 +2481,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v5f32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: 
scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5f32_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2024,6 +2605,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_f64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 +; 
GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f64_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2119,6 +2728,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2f64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 
4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f64_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2220,6 +2858,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3f64_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f64_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2313,6 +2981,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; 
GFX11-LABEL: test_call_external_void_func_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i16: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2402,6 +3098,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, 
external_void_func_v3i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2491,6 +3215,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte 
Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2582,6 +3334,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3i16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2673,6 +3453,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3f16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2762,6 +3571,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, 
external_void_func_v4i16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2853,6 +3690,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v4i16_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: 
s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2942,6 +3808,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f16: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3031,6 +3925,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; 
GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3122,6 +4044,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; 
GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3215,6 +4165,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 +; GFX11-NEXT: v_mov_b32_e32 v2, 5 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; 
GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3311,6 +4290,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3i32_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 +; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_i32: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3402,6 +4410,34 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; 
GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3497,6 +4533,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v4i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, 
external_void_func_v4i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3596,6 +4661,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v5i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_mov_b32_e32 v4, 5 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte 
Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i32_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3697,6 +4792,39 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v8i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] +; GFX11-NEXT: global_load_b128 v[4:7], v4, s[0:1] offset:16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3806,6 +4934,37 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v8i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 +; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6 +; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3914,6 +5073,41 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, 
s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v12, 0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_load_b128 v[0:3], v12, s[0:1] +; GFX11-NEXT: global_load_b128 v[4:7], v12, s[0:1] offset:16 +; GFX11-NEXT: global_load_b128 v[8:11], v12, s[0:1] offset:32 +; GFX11-NEXT: global_load_b128 v[12:15], v12, s[0:1] offset:48 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i32: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4033,6 +5227,45 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v32i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v28, 0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: global_load_b128 v[0:3], v28, s[0:1] +; GFX11-NEXT: global_load_b128 v[4:7], v28, s[0:1] offset:16 +; GFX11-NEXT: global_load_b128 v[8:11], v28, s[0:1] offset:32 +; GFX11-NEXT: global_load_b128 v[12:15], v28, s[0:1] offset:48 +; GFX11-NEXT: global_load_b128 v[16:19], v28, s[0:1] offset:64 +; GFX11-NEXT: global_load_b128 v[20:23], v28, s[0:1] offset:80 +; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96 +; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4162,6 +5395,47 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v32i32_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v28, 0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: 
s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: global_load_b32 v32, v[0:1], off +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: global_load_b128 v[0:3], v28, s[0:1] +; GFX11-NEXT: global_load_b128 v[4:7], v28, s[0:1] offset:16 +; GFX11-NEXT: global_load_b128 v[8:11], v28, s[0:1] offset:32 +; GFX11-NEXT: global_load_b128 v[12:15], v28, s[0:1] offset:48 +; GFX11-NEXT: global_load_b128 v[16:19], v28, s[0:1] offset:64 +; GFX11-NEXT: global_load_b128 v[20:23], v28, s[0:1] offset:80 +; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96 +; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: scratch_store_b32 off, v32, s32 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_i32: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4284,6 +5558,42 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_i32_func_i32_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: 
s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 +; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: global_store_b32 v[41:42], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_i32_func_i32_imm: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4391,6 +5701,39 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, 
s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_u8 v0, v1, s[0:1] +; GFX11-NEXT: global_load_b32 v1, v1, s[0:1] offset:4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_struct_i8_i32: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4494,6 +5837,38 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_byval_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b8 off, v0, s33 +; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:4 +; GFX11-NEXT: v_mov_b32_e32 v0, s33 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, 
external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_byval_struct_i8_i32: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4618,6 +5993,46 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:16 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: s_add_i32 vcc_lo, s33, 8 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b8 off, v0, s33 +; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:4 +; GFX11-NEXT: v_dual_mov_b32 v0, vcc_lo :: v_dual_mov_b32 v1, s33 +; GFX11-NEXT: v_writelane_b32 
v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_u8 v0, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v1, off, s33 offset:12 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4778,6 +6193,53 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v16i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1] +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v16i8@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v16i8@rel32@hi+12 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: 
v_lshrrev_b32_e32 v16, 8, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v8, v2 +; GFX11-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18 +; GFX11-NEXT: v_dual_mov_b32 v1, v16 :: v_dual_mov_b32 v2, v17 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i8: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5024,6 +6486,97 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: tail_call_byval_align16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:24 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 s4, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 offset:16 +; GFX11-NEXT: scratch_load_b32 v31, off, s33 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 32 +; 
GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v40, s34, 2 +; GFX11-NEXT: v_writelane_b32 v40, s35, 3 +; GFX11-NEXT: v_writelane_b32 v40, s36, 4 +; GFX11-NEXT: v_writelane_b32 v40, s37, 5 +; GFX11-NEXT: v_writelane_b32 v40, s38, 6 +; GFX11-NEXT: v_writelane_b32 v40, s39, 7 +; GFX11-NEXT: v_writelane_b32 v40, s40, 8 +; GFX11-NEXT: v_writelane_b32 v40, s41, 9 +; GFX11-NEXT: v_writelane_b32 v40, s42, 10 +; GFX11-NEXT: v_writelane_b32 v40, s43, 11 +; GFX11-NEXT: v_writelane_b32 v40, s44, 12 +; GFX11-NEXT: v_writelane_b32 v40, s45, 13 +; GFX11-NEXT: v_writelane_b32 v40, s46, 14 +; GFX11-NEXT: v_writelane_b32 v40, s47, 15 +; GFX11-NEXT: v_writelane_b32 v40, s48, 16 +; GFX11-NEXT: v_writelane_b32 v40, s49, 17 +; GFX11-NEXT: v_writelane_b32 v40, s50, 18 +; GFX11-NEXT: v_writelane_b32 v40, s51, 19 +; GFX11-NEXT: v_writelane_b32 v40, s52, 20 +; GFX11-NEXT: v_writelane_b32 v40, s53, 21 +; GFX11-NEXT: v_writelane_b32 v40, s54, 22 +; GFX11-NEXT: v_writelane_b32 v40, s55, 23 +; GFX11-NEXT: v_writelane_b32 v40, s56, 24 +; GFX11-NEXT: v_writelane_b32 v40, s57, 25 +; GFX11-NEXT: v_writelane_b32 v40, s58, 26 +; GFX11-NEXT: v_writelane_b32 v40, s59, 27 +; GFX11-NEXT: v_writelane_b32 v40, s60, 28 +; GFX11-NEXT: v_writelane_b32 v40, s61, 29 +; GFX11-NEXT: v_writelane_b32 v40, s62, 30 +; GFX11-NEXT: v_writelane_b32 v40, s63, 31 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s63, v40, 31 +; GFX11-NEXT: v_readlane_b32 s62, v40, 30 +; GFX11-NEXT: v_readlane_b32 s61, v40, 29 +; GFX11-NEXT: v_readlane_b32 s60, v40, 28 +; GFX11-NEXT: v_readlane_b32 s59, v40, 27 +; GFX11-NEXT: v_readlane_b32 s58, v40, 26 +; GFX11-NEXT: v_readlane_b32 s57, v40, 25 +; GFX11-NEXT: v_readlane_b32 s56, v40, 24 +; 
GFX11-NEXT: v_readlane_b32 s55, v40, 23 +; GFX11-NEXT: v_readlane_b32 s54, v40, 22 +; GFX11-NEXT: v_readlane_b32 s53, v40, 21 +; GFX11-NEXT: v_readlane_b32 s52, v40, 20 +; GFX11-NEXT: v_readlane_b32 s51, v40, 19 +; GFX11-NEXT: v_readlane_b32 s50, v40, 18 +; GFX11-NEXT: v_readlane_b32 s49, v40, 17 +; GFX11-NEXT: v_readlane_b32 s48, v40, 16 +; GFX11-NEXT: v_readlane_b32 s47, v40, 15 +; GFX11-NEXT: v_readlane_b32 s46, v40, 14 +; GFX11-NEXT: v_readlane_b32 s45, v40, 13 +; GFX11-NEXT: v_readlane_b32 s44, v40, 12 +; GFX11-NEXT: v_readlane_b32 s43, v40, 11 +; GFX11-NEXT: v_readlane_b32 s42, v40, 10 +; GFX11-NEXT: v_readlane_b32 s41, v40, 9 +; GFX11-NEXT: v_readlane_b32 s40, v40, 8 +; GFX11-NEXT: v_readlane_b32 s39, v40, 7 +; GFX11-NEXT: v_readlane_b32 s38, v40, 6 +; GFX11-NEXT: v_readlane_b32 s37, v40, 5 +; GFX11-NEXT: v_readlane_b32 s36, v40, 4 +; GFX11-NEXT: v_readlane_b32 s35, v40, 3 +; GFX11-NEXT: v_readlane_b32 s34, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: s_mov_b32 s33, s4 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:24 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: tail_call_byval_align16: ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5181,6 +6734,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i1_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_mov_b32_e32 v0, 1 +; GFX11-NEXT: s_mov_b32 s33, s32 +; 
GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: scratch_store_b8 off, v0, s32 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5274,6 +6856,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i8_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: 
v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5368,6 +6980,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i16_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_imm_inreg: ; GFX10-SCRATCH: ; 
%bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5462,6 +7104,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i32_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 42 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5562,6 +7234,39 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_i64_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; 
GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_i64_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5673,6 +7378,43 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2i64_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 6 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: v_writelane_b32 
v40, s7, 3 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 6 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5793,6 +7535,45 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2i64_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 6 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 +; 
GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 3 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 6 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5922,6 +7703,49 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3i64_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 8 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i64_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64_inreg@rel32@hi+12 +; 
GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 1 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: s_mov_b32 s9, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 6 +; GFX11-NEXT: v_writelane_b32 v40, s31, 7 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 7 +; GFX11-NEXT: v_readlane_b32 s30, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 8 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i64_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -6070,6 +7894,55 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v4i64_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 10 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i64_inreg@rel32@lo+4 +; 
GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 1 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: s_mov_b32 s9, 2 +; GFX11-NEXT: v_writelane_b32 v40, s10, 6 +; GFX11-NEXT: s_mov_b32 s10, 3 +; GFX11-NEXT: v_writelane_b32 v40, s11, 7 +; GFX11-NEXT: s_mov_b32 s11, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 8 +; GFX11-NEXT: v_writelane_b32 v40, s31, 9 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 9 +; GFX11-NEXT: v_readlane_b32 s30, v40, 8 +; GFX11-NEXT: v_readlane_b32 s11, v40, 7 +; GFX11-NEXT: v_readlane_b32 s10, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 10 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i64_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -6185,6 +8058,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_f16_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: 
s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x4400 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f16_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -6279,6 +8182,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_f32_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: 
v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -6379,6 +8312,39 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2f32_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -6488,6 +8454,42 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3f32_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 5 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 3 +; GFX11-NEXT: v_writelane_b32 v40, s31, 4 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 4 +; GFX11-NEXT: v_readlane_b32 s30, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 5 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -6612,6 +8614,48 @@ 
; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v5f32_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 7 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 4.0 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, -1.0 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 0.5 +; GFX11-NEXT: v_writelane_b32 v40, s30, 5 +; GFX11-NEXT: v_writelane_b32 v40, s31, 6 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 6 +; GFX11-NEXT: v_readlane_b32 s30, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 7 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5f32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ 
-6724,6 +8768,39 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_f64_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_f64_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -6839,6 +8916,45 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2f64_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: 
scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 6 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 6 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f64_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -6972,6 +9088,51 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3f64_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; 
GFX11-NEXT: v_writelane_b32 v40, s33, 8 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2.0 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 0x40100000 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 0 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: s_mov_b32 s9, 0x40200000 +; GFX11-NEXT: v_writelane_b32 v40, s30, 6 +; GFX11-NEXT: v_writelane_b32 v40, s31, 7 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 7 +; GFX11-NEXT: v_readlane_b32 s30, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 8 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f64_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7081,6 +9242,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2i16_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7180,6 +9371,38 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3i16_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 
s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7281,6 +9504,38 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3f16_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; 
GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7384,6 +9639,39 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3i16_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x20001 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 3 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: 
s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7487,6 +9775,39 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3f16_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x40003c00 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_movk_i32 s5, 0x4400 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7588,6 +9909,38 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v4i16_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7691,6 +10044,39 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v4i16_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 
+; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x20001 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 0x40003 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7788,6 +10174,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2f16_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 +; GFX11-NEXT: 
s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f16_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7887,6 +10303,38 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2i32_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, 
v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7990,6 +10438,39 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v2i32_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: 
test_call_external_void_func_v2i32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8099,6 +10580,42 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3i32_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 5 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 3 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 4 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 5 +; GFX11-NEXT: v_writelane_b32 v40, s30, 3 +; GFX11-NEXT: v_writelane_b32 v40, s31, 4 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 4 +; GFX11-NEXT: v_readlane_b32 s30, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 5 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8217,6 +10734,45 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 
s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v3i32_i32_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 6 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 3 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 4 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 5 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 6 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 6 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_i32_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8332,6 +10888,42 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v4i32_inreg: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 6 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 6 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8451,6 +11043,45 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v4i32_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 
4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 6 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 3 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 4 +; GFX11-NEXT: v_writelane_b32 v40, s30, 4 +; GFX11-NEXT: v_writelane_b32 v40, s31, 5 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 5 +; GFX11-NEXT: v_readlane_b32 s30, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 6 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8578,6 +11209,48 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v5i32_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 7 +; 
GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 3 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 4 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 5 +; GFX11-NEXT: v_writelane_b32 v40, s30, 5 +; GFX11-NEXT: v_writelane_b32 v40, s31, 6 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 6 +; GFX11-NEXT: v_readlane_b32 s30, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 7 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8716,6 +11389,52 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v8i32_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 
10 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: v_writelane_b32 v40, s10, 6 +; GFX11-NEXT: v_writelane_b32 v40, s11, 7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 8 +; GFX11-NEXT: v_writelane_b32 v40, s31, 9 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 9 +; GFX11-NEXT: v_readlane_b32 s30, v40, 8 +; GFX11-NEXT: v_readlane_b32 s11, v40, 7 +; GFX11-NEXT: v_readlane_b32 s10, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 10 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8870,6 +11589,57 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v8i32_imm_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 10 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_mov_b32 s5, 2 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: s_mov_b32 s6, 3 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: s_mov_b32 s7, 4 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: s_mov_b32 s8, 5 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: s_mov_b32 s9, 6 +; GFX11-NEXT: v_writelane_b32 v40, s10, 6 +; GFX11-NEXT: s_mov_b32 s10, 7 +; GFX11-NEXT: v_writelane_b32 v40, s11, 7 +; GFX11-NEXT: s_mov_b32 s11, 8 +; GFX11-NEXT: v_writelane_b32 v40, s30, 8 +; GFX11-NEXT: v_writelane_b32 v40, s31, 9 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 9 +; GFX11-NEXT: v_readlane_b32 s30, v40, 8 +; GFX11-NEXT: v_readlane_b32 s11, v40, 7 +; GFX11-NEXT: v_readlane_b32 s10, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 10 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; 
GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_imm_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -9049,6 +11819,68 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v16i32_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 18 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: v_writelane_b32 v40, s10, 6 +; GFX11-NEXT: v_writelane_b32 v40, s11, 7 +; GFX11-NEXT: v_writelane_b32 v40, s12, 8 +; GFX11-NEXT: v_writelane_b32 v40, s13, 9 +; GFX11-NEXT: v_writelane_b32 v40, s14, 10 +; GFX11-NEXT: v_writelane_b32 v40, s15, 11 +; GFX11-NEXT: v_writelane_b32 v40, s16, 12 +; GFX11-NEXT: v_writelane_b32 v40, s17, 13 +; GFX11-NEXT: v_writelane_b32 v40, s18, 14 +; GFX11-NEXT: v_writelane_b32 v40, s19, 15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 16 +; GFX11-NEXT: v_writelane_b32 v40, s31, 17 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 17 +; GFX11-NEXT: v_readlane_b32 s30, v40, 16 +; GFX11-NEXT: v_readlane_b32 s19, v40, 15 
+; GFX11-NEXT: v_readlane_b32 s18, v40, 14 +; GFX11-NEXT: v_readlane_b32 s17, v40, 13 +; GFX11-NEXT: v_readlane_b32 s16, v40, 12 +; GFX11-NEXT: v_readlane_b32 s15, v40, 11 +; GFX11-NEXT: v_readlane_b32 s14, v40, 10 +; GFX11-NEXT: v_readlane_b32 s13, v40, 9 +; GFX11-NEXT: v_readlane_b32 s12, v40, 8 +; GFX11-NEXT: v_readlane_b32 s11, v40, 7 +; GFX11-NEXT: v_readlane_b32 s10, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 18 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i32_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -9330,6 +12162,107 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v32i32_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 28 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: v_writelane_b32 v40, s10, 6 +; GFX11-NEXT: v_writelane_b32 v40, s11, 7 
+; GFX11-NEXT: v_writelane_b32 v40, s12, 8 +; GFX11-NEXT: v_writelane_b32 v40, s13, 9 +; GFX11-NEXT: v_writelane_b32 v40, s14, 10 +; GFX11-NEXT: v_writelane_b32 v40, s15, 11 +; GFX11-NEXT: v_writelane_b32 v40, s16, 12 +; GFX11-NEXT: v_writelane_b32 v40, s17, 13 +; GFX11-NEXT: v_writelane_b32 v40, s18, 14 +; GFX11-NEXT: v_writelane_b32 v40, s19, 15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0x40 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s20, 16 +; GFX11-NEXT: v_writelane_b32 v40, s21, 17 +; GFX11-NEXT: v_writelane_b32 v40, s22, 18 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v5, s51 +; GFX11-NEXT: v_writelane_b32 v40, s23, 19 +; GFX11-NEXT: v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v1, s47 +; GFX11-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49 +; GFX11-NEXT: v_writelane_b32 v40, s24, 20 +; GFX11-NEXT: s_mov_b32 s20, s36 +; GFX11-NEXT: s_mov_b32 s21, s37 +; GFX11-NEXT: s_mov_b32 s22, s38 +; GFX11-NEXT: s_mov_b32 s23, s39 +; GFX11-NEXT: v_writelane_b32 v40, s25, 21 +; GFX11-NEXT: s_mov_b32 s24, s40 +; GFX11-NEXT: s_mov_b32 s25, s41 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b64 off, v[4:5], s32 offset:16 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: v_writelane_b32 v40, s26, 22 +; GFX11-NEXT: s_mov_b32 s26, s42 +; GFX11-NEXT: v_writelane_b32 v40, s27, 23 +; GFX11-NEXT: s_mov_b32 s27, s43 +; GFX11-NEXT: v_writelane_b32 v40, s28, 24 +; GFX11-NEXT: s_mov_b32 s28, s44 +; GFX11-NEXT: v_writelane_b32 v40, s29, 25 +; GFX11-NEXT: s_mov_b32 s29, s45 +; GFX11-NEXT: v_writelane_b32 v40, s30, 26 +; GFX11-NEXT: v_writelane_b32 v40, s31, 27 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 27 +; GFX11-NEXT: v_readlane_b32 s30, v40, 26 +; GFX11-NEXT: v_readlane_b32 s29, v40, 25 +; GFX11-NEXT: v_readlane_b32 s28, v40, 24 +; GFX11-NEXT: v_readlane_b32 s27, v40, 23 +; GFX11-NEXT: v_readlane_b32 s26, v40, 22 +; GFX11-NEXT: v_readlane_b32 s25, v40, 21 +; GFX11-NEXT: v_readlane_b32 s24, v40, 20 +; GFX11-NEXT: v_readlane_b32 s23, v40, 19 +; GFX11-NEXT: v_readlane_b32 s22, v40, 18 +; GFX11-NEXT: v_readlane_b32 s21, v40, 17 +; GFX11-NEXT: v_readlane_b32 s20, v40, 16 +; GFX11-NEXT: v_readlane_b32 s19, v40, 15 +; GFX11-NEXT: v_readlane_b32 s18, v40, 14 +; GFX11-NEXT: v_readlane_b32 s17, v40, 13 +; GFX11-NEXT: v_readlane_b32 s16, v40, 12 +; GFX11-NEXT: v_readlane_b32 s15, v40, 11 +; GFX11-NEXT: v_readlane_b32 s14, v40, 10 +; GFX11-NEXT: v_readlane_b32 s13, v40, 9 +; GFX11-NEXT: v_readlane_b32 s12, v40, 8 +; GFX11-NEXT: v_readlane_b32 s11, v40, 7 +; GFX11-NEXT: v_readlane_b32 s10, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 28 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -9662,6 +12595,110 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: test_call_external_void_func_v32i32_i32_inreg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; 
GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 28 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: v_writelane_b32 v40, s6, 2 +; GFX11-NEXT: v_writelane_b32 v40, s7, 3 +; GFX11-NEXT: v_writelane_b32 v40, s8, 4 +; GFX11-NEXT: v_writelane_b32 v40, s9, 5 +; GFX11-NEXT: v_writelane_b32 v40, s10, 6 +; GFX11-NEXT: v_writelane_b32 v40, s11, 7 +; GFX11-NEXT: v_writelane_b32 v40, s12, 8 +; GFX11-NEXT: v_writelane_b32 v40, s13, 9 +; GFX11-NEXT: v_writelane_b32 v40, s14, 10 +; GFX11-NEXT: v_writelane_b32 v40, s15, 11 +; GFX11-NEXT: v_writelane_b32 v40, s16, 12 +; GFX11-NEXT: v_writelane_b32 v40, s17, 13 +; GFX11-NEXT: v_writelane_b32 v40, s18, 14 +; GFX11-NEXT: v_writelane_b32 v40, s19, 15 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0x40 +; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32_inreg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32_inreg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s20, 16 +; GFX11-NEXT: v_writelane_b32 v40, s21, 17 +; GFX11-NEXT: v_writelane_b32 v40, s22, 18 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v5, s51 +; GFX11-NEXT: v_writelane_b32 v40, s23, 19 +; GFX11-NEXT: v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v1, s47 +; GFX11-NEXT: v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v3, s49 +; GFX11-NEXT: v_writelane_b32 v40, s24, 20 +; GFX11-NEXT: v_mov_b32_e32 v2, s48 +; GFX11-NEXT: s_mov_b32 s20, s36 +; GFX11-NEXT: s_mov_b32 s21, s37 +; GFX11-NEXT: s_mov_b32 s22, s38 +; GFX11-NEXT: v_writelane_b32 v40, s25, 21 +; GFX11-NEXT: 
s_mov_b32 s23, s39 +; GFX11-NEXT: s_mov_b32 s24, s40 +; GFX11-NEXT: s_mov_b32 s25, s41 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b32 off, v6, s32 offset:24 +; GFX11-NEXT: scratch_store_b64 off, v[4:5], s32 offset:16 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: v_writelane_b32 v40, s26, 22 +; GFX11-NEXT: s_mov_b32 s26, s42 +; GFX11-NEXT: v_writelane_b32 v40, s27, 23 +; GFX11-NEXT: s_mov_b32 s27, s43 +; GFX11-NEXT: v_writelane_b32 v40, s28, 24 +; GFX11-NEXT: s_mov_b32 s28, s44 +; GFX11-NEXT: v_writelane_b32 v40, s29, 25 +; GFX11-NEXT: s_mov_b32 s29, s45 +; GFX11-NEXT: v_writelane_b32 v40, s30, 26 +; GFX11-NEXT: v_writelane_b32 v40, s31, 27 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 27 +; GFX11-NEXT: v_readlane_b32 s30, v40, 26 +; GFX11-NEXT: v_readlane_b32 s29, v40, 25 +; GFX11-NEXT: v_readlane_b32 s28, v40, 24 +; GFX11-NEXT: v_readlane_b32 s27, v40, 23 +; GFX11-NEXT: v_readlane_b32 s26, v40, 22 +; GFX11-NEXT: v_readlane_b32 s25, v40, 21 +; GFX11-NEXT: v_readlane_b32 s24, v40, 20 +; GFX11-NEXT: v_readlane_b32 s23, v40, 19 +; GFX11-NEXT: v_readlane_b32 s22, v40, 18 +; GFX11-NEXT: v_readlane_b32 s21, v40, 17 +; GFX11-NEXT: v_readlane_b32 s20, v40, 16 +; GFX11-NEXT: v_readlane_b32 s19, v40, 15 +; GFX11-NEXT: v_readlane_b32 s18, v40, 14 +; GFX11-NEXT: v_readlane_b32 s17, v40, 13 +; GFX11-NEXT: v_readlane_b32 s16, v40, 12 +; GFX11-NEXT: v_readlane_b32 s15, v40, 11 +; GFX11-NEXT: v_readlane_b32 s14, v40, 10 +; GFX11-NEXT: v_readlane_b32 s13, v40, 9 +; GFX11-NEXT: v_readlane_b32 s12, v40, 8 +; GFX11-NEXT: v_readlane_b32 s11, v40, 7 +; GFX11-NEXT: v_readlane_b32 s10, v40, 6 +; GFX11-NEXT: v_readlane_b32 s9, v40, 5 +; GFX11-NEXT: v_readlane_b32 s8, v40, 4 +; GFX11-NEXT: v_readlane_b32 s7, v40, 3 +; GFX11-NEXT: v_readlane_b32 s6, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 
-16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 28 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_i32_inreg: ; GFX10-SCRATCH: ; %bb.0: ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -9844,6 +12881,35 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: stack_passed_arg_alignment_v32i32_f64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: stack_passed_arg_alignment_v32i32_f64: ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -10013,6 +13079,52 @@ ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: stack_12xv3i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13 +; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1 +; GFX11-NEXT: v_dual_mov_b32 v6, 2 :: v_dual_mov_b32 v7, 2 +; GFX11-NEXT: v_dual_mov_b32 v8, 2 :: v_dual_mov_b32 v9, 3 +; GFX11-NEXT: v_dual_mov_b32 v10, 3 :: v_dual_mov_b32 v11, 3 +; GFX11-NEXT: v_dual_mov_b32 v12, 4 :: v_dual_mov_b32 v13, 4 +; GFX11-NEXT: v_dual_mov_b32 v14, 4 :: v_dual_mov_b32 v15, 5 +; GFX11-NEXT: v_dual_mov_b32 v16, 5 :: v_dual_mov_b32 v17, 5 +; GFX11-NEXT: v_dual_mov_b32 v18, 6 :: v_dual_mov_b32 v19, 6 +; GFX11-NEXT: v_dual_mov_b32 v20, 6 :: v_dual_mov_b32 v21, 7 +; GFX11-NEXT: v_dual_mov_b32 v22, 7 :: v_dual_mov_b32 v23, 7 +; GFX11-NEXT: v_dual_mov_b32 v24, 8 :: v_dual_mov_b32 v25, 8 +; GFX11-NEXT: v_dual_mov_b32 v26, 8 :: v_dual_mov_b32 v27, 9 +; GFX11-NEXT: v_dual_mov_b32 v28, 9 :: v_dual_mov_b32 v29, 9 +; GFX11-NEXT: v_dual_mov_b32 v30, 10 :: v_dual_mov_b32 v31, 11 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; 
GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: stack_12xv3i32: ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -10244,6 +13356,56 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: stack_8xv5i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13 +; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15 +; GFX11-NEXT: v_dual_mov_b32 v4, 8 :: v_dual_mov_b32 v5, 9 +; GFX11-NEXT: v_dual_mov_b32 v6, 10 :: v_dual_mov_b32 v7, 11 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], s32 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1 +; GFX11-NEXT: v_dual_mov_b32 v6, 1 :: v_dual_mov_b32 v7, 1 +; GFX11-NEXT: v_dual_mov_b32 v8, 1 :: v_dual_mov_b32 v9, 1 +; GFX11-NEXT: v_dual_mov_b32 v10, 2 :: v_dual_mov_b32 v11, 2 +; GFX11-NEXT: v_dual_mov_b32 v12, 2 :: v_dual_mov_b32 v13, 2 +; GFX11-NEXT: v_dual_mov_b32 v14, 2 :: v_dual_mov_b32 v15, 3 +; GFX11-NEXT: v_dual_mov_b32 v16, 3 :: v_dual_mov_b32 
v17, 3 +; GFX11-NEXT: v_dual_mov_b32 v18, 3 :: v_dual_mov_b32 v19, 3 +; GFX11-NEXT: v_dual_mov_b32 v20, 4 :: v_dual_mov_b32 v21, 4 +; GFX11-NEXT: v_dual_mov_b32 v22, 4 :: v_dual_mov_b32 v23, 4 +; GFX11-NEXT: v_dual_mov_b32 v24, 4 :: v_dual_mov_b32 v25, 5 +; GFX11-NEXT: v_dual_mov_b32 v26, 5 :: v_dual_mov_b32 v27, 5 +; GFX11-NEXT: v_dual_mov_b32 v28, 5 :: v_dual_mov_b32 v29, 5 +; GFX11-NEXT: v_dual_mov_b32 v30, 6 :: v_dual_mov_b32 v31, 7 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: stack_8xv5i32: ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -10476,6 +13638,62 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-LABEL: stack_8xv5f32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x41400000 +; GFX11-NEXT: v_mov_b32_e32 v1, 0x41500000 +; GFX11-NEXT: v_mov_b32_e32 v2, 0x41600000 +; GFX11-NEXT: v_mov_b32_e32 v3, 0x41700000 +; GFX11-NEXT: v_mov_b32_e32 v4, 0x41000000 +; GFX11-NEXT: v_mov_b32_e32 v5, 0x41100000 +; GFX11-NEXT: 
v_mov_b32_e32 v6, 0x41200000 +; GFX11-NEXT: v_mov_b32_e32 v7, 0x41300000 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 +; GFX11-NEXT: scratch_store_b128 off, v[4:7], s32 +; GFX11-NEXT: v_mov_b32_e32 v6, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v7, 1.0 :: v_dual_mov_b32 v8, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v9, 1.0 :: v_dual_mov_b32 v10, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v11, 2.0 :: v_dual_mov_b32 v12, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v13, 2.0 :: v_dual_mov_b32 v14, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v15, 0x40400000 :: v_dual_mov_b32 v16, 0x40400000 +; GFX11-NEXT: v_dual_mov_b32 v17, 0x40400000 :: v_dual_mov_b32 v18, 0x40400000 +; GFX11-NEXT: v_dual_mov_b32 v19, 0x40400000 :: v_dual_mov_b32 v20, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v21, 4.0 :: v_dual_mov_b32 v22, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v23, 4.0 :: v_dual_mov_b32 v24, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v25, 0x40a00000 :: v_dual_mov_b32 v26, 0x40a00000 +; GFX11-NEXT: v_dual_mov_b32 v27, 0x40a00000 :: v_dual_mov_b32 v28, 0x40a00000 +; GFX11-NEXT: v_mov_b32_e32 v29, 0x40a00000 +; GFX11-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; GFX11-NEXT: v_mov_b32_e32 v31, 0x40e00000 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5f32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; 
GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GFX10-SCRATCH-LABEL: stack_8xv5f32: ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s declare hidden amdgpu_gfx void @external_void_func_void() #0 @@ -71,6 +72,40 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 4 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s5, 1 +; GFX11-NEXT: s_getpc_b64 s[4:5] +; GFX11-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GFX11-NEXT: 
v_writelane_b32 v40, s30, 2 +; GFX11-NEXT: v_writelane_b32 v40, s31, 3 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 3 +; GFX11-NEXT: v_readlane_b32 s30, v40, 2 +; GFX11-NEXT: v_readlane_b32 s5, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 4 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @external_void_func_void() call void asm sideeffect "", ""() #0 call amdgpu_gfx void @external_void_func_void() @@ -86,9 +121,16 @@ ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: v_writelane_b32 v0, s28, 0 ; GFX9-NEXT: v_writelane_b32 v0, s29, 1 +; GFX9-NEXT: v_writelane_b32 v0, s30, 2 +; GFX9-NEXT: v_writelane_b32 v0, s31, 3 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber ; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: ;;#ASMSTART +; GFX9-NEXT: ; clobber +; GFX9-NEXT: ;;#ASMEND +; GFX9-NEXT: v_readlane_b32 s31, v0, 3 +; GFX9-NEXT: v_readlane_b32 s30, v0, 2 ; GFX9-NEXT: v_readlane_b32 s29, v0, 1 ; GFX9-NEXT: v_readlane_b32 s28, v0, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 @@ -107,9 +149,16 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_writelane_b32 v0, s28, 0 ; GFX10-NEXT: v_writelane_b32 v0, s29, 1 +; GFX10-NEXT: v_writelane_b32 v0, s30, 2 +; GFX10-NEXT: v_writelane_b32 v0, s31, 3 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ; clobber +; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_readlane_b32 s31, v0, 3 +; GFX10-NEXT: v_readlane_b32 s30, v0, 2 ; GFX10-NEXT: v_readlane_b32 s29, v0, 1 ; GFX10-NEXT: v_readlane_b32 s28, v0, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 
@@ -119,6 +168,36 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_void_clobber_s28_s29: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v0, s28, 0 +; GFX11-NEXT: v_writelane_b32 v0, s29, 1 +; GFX11-NEXT: v_writelane_b32 v0, s30, 2 +; GFX11-NEXT: v_writelane_b32 v0, s31, 3 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; clobber +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; clobber +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v0, 3 +; GFX11-NEXT: v_readlane_b32 s30, v0, 2 +; GFX11-NEXT: v_readlane_b32 s29, v0, 1 +; GFX11-NEXT: v_readlane_b32 s28, v0, 0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + call void asm sideeffect "; clobber", "~{s[30:31]}"() #0 ; GCN: v_writelane_b32 v0, s28, 0 ; GCN: v_writelane_b32 v0, s29, 1 @@ -201,6 +280,42 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_call_void_func_void_mayclobber_s31: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, 
external_void_func_void@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; def s31 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_mov_b32 s4, s31 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_mov_b32 s31, s4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s31 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %s31 = call i32 asm sideeffect "; def $0", "={s31}"() call amdgpu_gfx void @external_void_func_void() call void asm sideeffect "; use $0", "{s31}"(i32 %s31) @@ -280,6 +395,42 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_call_void_func_void_mayclobber_v31: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:4 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; def v31 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_mov_b32_e32 v41, v31 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; GFX11-NEXT: 
s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v31, v41 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v31 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:4 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %v31 = call i32 asm sideeffect "; def $0", "={v31}"() call amdgpu_gfx void @external_void_func_void() call void asm sideeffect "; use $0", "{v31}"(i32 %v31) @@ -360,6 +511,43 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_call_void_func_void_preserves_s33: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; def s33 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, s33 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_mov_b32 s33, s4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s33 +; GFX11-NEXT: ;;#ASMEND 
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %s33 = call i32 asm sideeffect "; def $0", "={s33}"() call amdgpu_gfx void @external_void_func_void() call void asm sideeffect "; use $0", "{s33}"(i32 %s33) @@ -439,6 +627,43 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_call_void_func_void_preserves_s34: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; def s34 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, s34 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_mov_b32 s34, s4 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s34 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, 
v40, 3 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %s34 = call i32 asm sideeffect "; def $0", "={s34}"() call amdgpu_gfx void @external_void_func_void() call void asm sideeffect "; use $0", "{s34}"(i32 %s34) @@ -514,6 +739,40 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_call_void_func_void_preserves_v40: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:4 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v41, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; def v40 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v41, s30, 0 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v41, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v40 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: v_readlane_b32 s31, v41, 1 +; GFX11-NEXT: v_readlane_b32 s30, v41, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v41, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:4 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %v40 = call i32 asm sideeffect "; def $0", "={v40}"() call 
amdgpu_gfx void @external_void_func_void() call void asm sideeffect "; use $0", "{v40}"(i32 %v40) @@ -558,6 +817,26 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_void_clobber_s33: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v0, s33, 0 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; clobber +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s33, v0, 0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s33}"() #0 ret void } @@ -600,6 +879,26 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: void_func_void_clobber_s34: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v0, s34, 0 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; clobber +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s34, v0, 0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", 
"~{s34}"() #0 ret void } @@ -657,6 +956,33 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_call_void_func_void_clobber_s33: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s33@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, void_func_void_clobber_s33@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @void_func_void_clobber_s33() ret void } @@ -714,6 +1040,33 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_call_void_func_void_clobber_s34: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: 
s_add_u32 s0, s0, void_func_void_clobber_s34@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, void_func_void_clobber_s34@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void @void_func_void_clobber_s34() ret void } @@ -789,6 +1142,42 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: callee_saved_sgpr_kernel: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; def s40 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, s40 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s4 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, v40, 
0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 call amdgpu_gfx void @external_void_func_void() call void asm sideeffect "; use $0", "s"(i32 %s40) #0 @@ -884,6 +1273,50 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: callee_saved_sgpr_vgpr_kernel: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:4 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 3 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; def s40 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, s40 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; def v32 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_mov_b32_e32 v41, v32 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s4 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v41 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 +; GFX11-NEXT: v_readlane_b32 s4, 
v40, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v40, 3 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:4 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0 call amdgpu_gfx void @external_void_func_void() diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_gfx i1 @return_i1() #0 { ; GFX9-LABEL: return_i1: @@ -9,12 +10,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: return_i1: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: return_i1: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] entry: ret i1 1 } @@ -76,6 +77,35 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: call_i1: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v1, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v1, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, return_i1@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, return_i1@gotpcrel32@hi+12 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v1, s30, 0 +; GFX11-NEXT: v_writelane_b32 v1, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v1, 1 +; GFX11-NEXT: v_readlane_b32 s30, v1, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v1, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v1, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: call amdgpu_gfx i1 @return_i1() ret void @@ -88,12 +118,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 10 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: return_i16: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 10 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: return_i16: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 10 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] entry: ret i16 10 } @@ -155,6 +185,35 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX11-LABEL: call_i16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v1, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v1, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, return_i16@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, return_i16@gotpcrel32@hi+12 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v1, s30, 0 +; GFX11-NEXT: v_writelane_b32 v1, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v1, 1 +; GFX11-NEXT: v_readlane_b32 s30, v1, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v1, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v1, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: call amdgpu_gfx i16 @return_i16() ret void @@ -167,12 +226,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: return_2xi16: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: return_2xi16: +; GFX10PLUS: ; %bb.0: ; %entry +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0x20001 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] entry: ret <2 x i16> } @@ -234,6 +293,35 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 
s[30:31] +; +; GFX11-LABEL: call_2xi16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v1, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v1, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, return_2xi16@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, return_2xi16@gotpcrel32@hi+12 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v1, s30, 0 +; GFX11-NEXT: v_writelane_b32 v1, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v1, 1 +; GFX11-NEXT: v_readlane_b32 s30, v1, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v1, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v1, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: call amdgpu_gfx <2 x i16> @return_2xi16() ret void @@ -254,6 +342,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: return_3xi16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: ret <3 x i16> } @@ -315,6 +410,35 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: call_3xi16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: 
s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v2, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, return_3xi16@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, return_3xi16@gotpcrel32@hi+12 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v2, s30, 0 +; GFX11-NEXT: v_writelane_b32 v2, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v2, 1 +; GFX11-NEXT: v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: v_readlane_b32 s33, v2, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: call amdgpu_gfx <3 x i16> @return_3xi16() ret void @@ -1361,6 +1485,151 @@ ; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: return_512xi32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: s_clause 0x3e +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2032 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2016 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:2000 +; GFX11-NEXT: 
scratch_store_b128 v0, v[1:4], off offset:1984 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1968 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1952 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1936 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1920 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1904 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1888 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1872 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1856 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1840 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1824 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1808 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1792 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1776 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1760 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1744 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1728 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1712 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1696 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1680 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1664 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1648 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1632 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1616 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1600 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1584 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1568 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1552 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1536 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1520 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1504 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1488 +; GFX11-NEXT: 
scratch_store_b128 v0, v[1:4], off offset:1472 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1456 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1440 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1424 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1408 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1392 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1376 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1360 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1344 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1328 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1312 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1296 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1280 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1264 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1248 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1232 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1216 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1200 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1184 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1168 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1152 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1136 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1120 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1104 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1088 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1072 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1056 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1040 +; GFX11-NEXT: s_clause 0x3e +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1024 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:1008 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:992 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:976 +; 
GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:960 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:944 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:928 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:912 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:896 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:880 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:864 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:848 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:832 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:816 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:800 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:784 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:768 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:752 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:736 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:720 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:704 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:688 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:672 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:656 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:640 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:624 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:608 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:592 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:576 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:560 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:544 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:528 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:512 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:496 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:480 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:464 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off 
offset:448 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:432 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:416 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:400 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:384 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:368 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:352 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:336 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:320 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:304 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:288 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:272 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:256 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:240 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:224 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:208 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:192 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:176 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:160 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:144 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:128 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:112 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: ret <512 x i32> zeroinitializer } @@ -1426,6 +1695,37 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
call_512xi32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v5, s32 offset:2048 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v5, s33, 2 +; GFX11-NEXT: s_add_i32 s33, s32, 0x7ff +; GFX11-NEXT: s_addk_i32 s32, 0x1800 +; GFX11-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, return_512xi32@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, return_512xi32@gotpcrel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v5, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v0, s33 +; GFX11-NEXT: v_writelane_b32 v5, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_readlane_b32 s31, v5, 1 +; GFX11-NEXT: v_readlane_b32 s30, v5, 0 +; GFX11-NEXT: s_addk_i32 s32, 0xe800 +; GFX11-NEXT: v_readlane_b32 s33, v5, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v5, off, s32 offset:2048 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: call amdgpu_gfx <512 x i32> @return_512xi32() ret void diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s +; RUN: llc -march=amdgcn 
-mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 { ; GFX900-LABEL: global_atomic_fadd_ret_f32: @@ -112,6 +113,35 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_ret_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef ret void @@ -209,6 +239,35 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_ret_f32_ieee: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: 
s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB1_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -286,6 +345,18 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_noret_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -362,6 +433,18 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_noret_f32_ieee: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -458,6 +541,35 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_ret_f32_agent: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB4_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -571,6 +683,35 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: 
global_atomic_fadd_ret_f32_system: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB5_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst store float %result, float addrspace(1)* undef ret void @@ -720,6 +861,32 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_noret_f32_safe: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], 
s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB8_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -794,6 +961,17 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: infer_as_before_atomic: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %load = load float*, float* addrspace(4)* %arg %v = atomicrmw fadd float* %load, float 1.0 syncscope("agent-one-as") monotonic, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s ; Test using saddr addressing mode of global_* flat atomic instructions. 
@@ -22,6 +23,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_xchg_saddr_i32_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -48,6 +59,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_xchg_saddr_i32_nortn_offset_2047: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] offset:2047 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047 @@ -75,6 +96,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[2:3] offset:-2048 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048 @@ -101,6 +132,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return 
to shader part epilog +; +; GFX11-LABEL: global_xchg_saddr_i32_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -131,6 +172,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_xchg_saddr_i32_rtn_2048: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048 @@ -158,6 +209,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_xchg_saddr_i32_rtn_neg2048: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048 @@ -203,6 +264,21 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; 
GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: ds_load_b64 v[2:3], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -242,6 +318,21 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: ds_load_b64 v[2:3], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -282,6 +373,21 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: ds_load_b64 v[2:3], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -320,6 +426,21 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: ds_load_b64 v[2:3], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] offset:42 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -355,6 +476,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_xchg_saddr_i64_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_swap_b64 v[0:1], v0, v[1:2], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 
%cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -381,6 +512,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_xchg_saddr_i64_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_swap_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -408,6 +549,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_xchg_saddr_i64_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_swap_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -433,6 +584,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_xchg_saddr_i64_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_swap_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* 
%gep0, i64 -128 @@ -463,6 +624,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_add_saddr_i32_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_u32 v0, v0, v1, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -489,6 +660,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_add_saddr_i32_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -516,6 +697,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_add_saddr_i32_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_u32 v0, v1, s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -541,6 +732,16 @@ ; GFX10-NEXT: 
buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_add_saddr_i32_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_u32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -567,6 +768,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_add_saddr_i64_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -593,6 +804,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_add_saddr_i64_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -620,6 +841,16 @@ ; GFX10-NEXT: buffer_gl0_inv 
; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_add_saddr_i64_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_u64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -645,6 +876,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_add_saddr_i64_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_add_u64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -675,6 +916,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_sub_saddr_i32_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_sub_u32 v0, v0, v1, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -701,6 +952,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; 
GFX11-LABEL: global_sub_saddr_i32_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -728,6 +989,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_sub_saddr_i32_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_sub_u32 v0, v1, s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -753,6 +1024,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_sub_saddr_i32_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_sub_u32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -779,6 +1060,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_sub_saddr_i64_rtn: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_sub_u64 v[0:1], v0, v[1:2], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -805,6 +1096,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_sub_saddr_i64_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_sub_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -832,6 +1133,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_sub_saddr_i64_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_sub_u64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -857,6 +1168,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_sub_saddr_i64_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_sub_u64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -887,6 +1208,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_and_saddr_i32_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_and_b32 v0, v0, v1, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -913,6 +1244,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_and_saddr_i32_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -940,6 +1281,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_and_saddr_i32_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt 
null, 0x0 +; GFX11-NEXT: global_atomic_and_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -965,6 +1316,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_and_saddr_i32_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_and_b32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -991,6 +1352,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_and_saddr_i64_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_and_b64 v[0:1], v0, v[1:2], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -1017,6 +1388,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_and_saddr_i64_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_and_b64 v[0:1], 
v0, v[1:2], s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1044,6 +1425,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_and_saddr_i64_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_and_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -1069,6 +1460,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_and_saddr_i64_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_and_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1099,6 +1500,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_or_saddr_i32_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_or_b32 v0, v0, v1, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -1125,6 +1536,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_or_saddr_i32_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1152,6 +1573,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_or_saddr_i32_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_or_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -1177,6 +1608,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_or_saddr_i32_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_or_b32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: 
buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1203,6 +1644,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_or_saddr_i64_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_or_b64 v[0:1], v0, v[1:2], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -1229,6 +1680,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_or_saddr_i64_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_or_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1256,6 +1717,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_or_saddr_i64_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_or_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: 
s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -1281,6 +1752,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_or_saddr_i64_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_or_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1311,6 +1792,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_xor_saddr_i32_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_xor_b32 v0, v0, v1, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -1337,6 +1828,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_xor_saddr_i32_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = 
zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1364,6 +1865,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_xor_saddr_i32_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_xor_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -1389,6 +1900,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_xor_saddr_i32_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_xor_b32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1415,6 +1936,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_xor_saddr_i64_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_xor_b64 v[0:1], v0, v[1:2], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* 
%sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -1441,6 +1972,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_xor_saddr_i64_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_xor_b64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1468,6 +2009,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_xor_saddr_i64_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_xor_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -1493,6 +2044,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_xor_saddr_i64_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_xor_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr 
inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1523,6 +2084,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_max_saddr_i32_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -1549,6 +2120,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_max_saddr_i32_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1576,6 +2157,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_max_saddr_i32_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_i32 v0, v1, s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* 
@@ -1601,6 +2192,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_max_saddr_i32_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_i32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1627,6 +2228,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_max_saddr_i64_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -1653,6 +2264,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_max_saddr_i64_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ 
-1680,6 +2301,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_max_saddr_i64_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -1705,6 +2336,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_max_saddr_i64_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_i64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1735,6 +2376,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_min_saddr_i32_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -1761,6 +2412,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; 
GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_min_saddr_i32_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1788,6 +2449,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_min_saddr_i32_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_i32 v0, v1, s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -1813,6 +2484,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_min_saddr_i32_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_i32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1839,6 +2520,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: 
global_min_saddr_i64_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -1865,6 +2556,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_min_saddr_i64_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_i64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1892,6 +2593,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_min_saddr_i64_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -1917,6 +2628,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_min_saddr_i64_nortn_neg128: +; GFX11: ; 
%bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_i64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1947,6 +2668,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_umax_saddr_i32_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -1973,6 +2704,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2000,6 +2741,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_umax_saddr_i32_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_u32 v0, v1, s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -2025,6 +2776,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_umax_saddr_i32_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_u32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2051,6 +2812,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_umax_saddr_i64_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -2077,6 +2848,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_umax_saddr_i64_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2104,6 +2885,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_umax_saddr_i64_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -2129,6 +2920,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_umax_saddr_i64_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_max_u64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2159,6 +2960,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_umin_saddr_i32_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: 
global_atomic_min_u32 v0, v0, v1, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -2185,6 +2996,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2212,6 +3033,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_umin_saddr_i32_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_u32 v0, v1, s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -2237,6 +3068,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_umin_saddr_i32_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_u32 v0, v1, s[2:3] offset:-128 +; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2263,6 +3104,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_umin_saddr_i64_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -2289,6 +3140,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_umin_saddr_i64_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2316,6 +3177,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_umin_saddr_i64_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] +; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -2341,6 +3212,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_umin_saddr_i64_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_min_u64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2373,6 +3254,17 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_cmpxchg_saddr_i32_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -2402,6 +3294,17 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_cmpxchg_saddr_i32_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: 
global_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2432,6 +3335,17 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_cmpxchg_saddr_i32_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -2459,6 +3373,17 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_cmpxchg_saddr_i32_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2489,6 +3414,18 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_cmpxchg_saddr_i64_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: v_mov_b32_e32 
v5, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v0, v[3:6], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -2520,6 +3457,18 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_cmpxchg_saddr_i64_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v0, v[3:6], s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2552,6 +3501,18 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_cmpxchg_saddr_i64_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v0, v[3:6], s[2:3] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -2581,6 +3542,18 @@ ; GFX10-NEXT: 
buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_cmpxchg_saddr_i64_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b64 v0, v[3:6], s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2602,6 +3575,12 @@ ; GCN-NEXT: global_atomic_inc v0, v0, v1, s[2:3] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_inc_saddr_i32_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_inc_u32 v0, v0, v1, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -2616,6 +3595,12 @@ ; GCN-NEXT: global_atomic_inc v0, v0, v1, s[2:3] offset:-128 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_inc_saddr_i32_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2630,6 +3615,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_inc v0, v1, s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_inc_saddr_i32_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_inc_u32 
v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -2642,6 +3633,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_inc v0, v1, s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_inc_saddr_i32_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2656,6 +3653,12 @@ ; GCN-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_inc_saddr_i64_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v0, v[1:2], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -2670,6 +3673,12 @@ ; GCN-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_inc_saddr_i64_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2684,6 +3693,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: 
global_atomic_inc_x2 v0, v[1:2], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_inc_saddr_i64_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -2696,6 +3711,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_inc_x2 v0, v[1:2], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_inc_saddr_i64_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2717,6 +3738,12 @@ ; GCN-NEXT: global_atomic_dec v0, v0, v1, s[2:3] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_dec_saddr_i32_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_dec_u32 v0, v0, v1, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -2731,6 +3758,12 @@ ; GCN-NEXT: global_atomic_dec v0, v0, v1, s[2:3] offset:-128 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_dec_saddr_i32_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 
%zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2745,6 +3778,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_dec v0, v1, s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_dec_saddr_i32_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -2757,6 +3796,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_dec v0, v1, s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_dec_saddr_i32_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2771,6 +3816,12 @@ ; GCN-NEXT: global_atomic_dec_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_dec_saddr_i64_rtn: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v0, v[1:2], s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -2785,6 +3836,12 @@ ; GCN-NEXT: global_atomic_dec_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_dec_saddr_i64_rtn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2799,6 +3856,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_dec_x2 v0, v[1:2], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_dec_saddr_i64_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -2811,6 +3874,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_dec_x2 v0, v[1:2], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_dec_saddr_i64_nortn_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s ; Test using saddr addressing mode of global_*load_* flat instructions. 
@@ -16,6 +17,13 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %load = load i8, i8 addrspace(1)* %sbase %zext = zext i8 %load to i32 %to.vgpr = bitcast i32 %zext to float @@ -37,6 +45,13 @@ ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_4095: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -52,6 +67,13 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_4096: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4096 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -67,6 +89,13 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_4097: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4097 %load = 
load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -90,6 +119,13 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_neg4096: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4096 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -115,6 +151,15 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_neg4097: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4097 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -140,6 +185,15 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_neg4098: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4098 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -162,6 +216,13 @@ ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] 
; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_2048: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2048 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -184,6 +245,13 @@ ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_2049: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2049 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2049 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -206,6 +274,13 @@ ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_2050: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2050 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2050 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -221,6 +296,13 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_neg2048: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 
-2048 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -244,6 +326,13 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_neg2049: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2049 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -267,6 +356,13 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_neg2050: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2050 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2050 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -288,6 +384,13 @@ ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_4294967295: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0xfffff000 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967295 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -312,6 +415,15 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_4294967296: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967296 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -336,6 +448,15 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_4294967297: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967297 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -360,6 +481,15 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_4294971391: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971391 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -384,6 +514,15 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_4294971392: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s[0:1] +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971392 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -409,6 +548,15 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967295: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967295 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -433,6 +581,15 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967296: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967296 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -457,6 +614,15 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_neg4294967297: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, s[0:1], 0, s2 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s[0:1] +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967297 %load = load i8, i8 addrspace(1)* %gep0 %zext = zext i8 %load to i32 @@ -475,6 +641,12 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %load = load i8, i8 addrspace(1)* %gep0 @@ -500,6 +672,12 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095 @@ -531,6 +709,18 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1] +; GFX11-NEXT: v_add_co_u32 v0, vcc, 0x1000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc 
+; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4096 @@ -557,6 +747,12 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4096 @@ -588,6 +784,18 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, s[0:1], s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1] +; GFX11-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4097 @@ -604,6 +812,12 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047: +; 
GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047 @@ -630,6 +844,12 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2048 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048 @@ -646,6 +866,12 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048 @@ -672,6 +898,12 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2049 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* 
%gep0, i64 -2049 @@ -698,6 +930,12 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 %zext.offset @@ -714,6 +952,12 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 %add = add i64 %sbase.as.int, %zext.offset @@ -731,6 +975,12 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 %add = add i64 %zext.offset, %sbase.as.int @@ -748,6 +998,12 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 
%sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 %add = add i64 %zext.offset, %sbase.as.int @@ -766,6 +1022,12 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 %add.immoffset = add i64 %sbase.as.int, 128 @@ -807,6 +1069,17 @@ ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_uniform_ptr_in_vgprs: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: ds_load_b64 v[1:2], v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -840,6 +1113,17 @@ ; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: ds_load_b64 v[1:2], v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:42 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %sbase = load i8 addrspace(1)*, i8 addrspace(1)* 
addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -858,6 +1142,13 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_uniform_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %load = load i8, i8 addrspace(1)* %gep0 @@ -874,6 +1165,13 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-24 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-24 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -24 @@ -891,6 +1189,13 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 %add = add i64 %zext.offset, %sbase.as.int @@ -909,6 +1214,13 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: 
global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 %add = add i64 %zext.offset, %sbase.as.int @@ -937,6 +1249,14 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_i8_vgpr64_sgpr32: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, vcc, v0, s2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset %load = load i8, i8 addrspace(1)* %gep0 @@ -964,6 +1284,14 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v0, vcc, v0, s2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095 @@ -1003,6 +1331,20 @@ ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_f32_natural_addressing: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) 
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX11-NEXT: v_add_co_u32 v0, vcc, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %voffset = load i32, i32 addrspace(1)* %voffset.ptr %zext.offset = zext i32 %voffset to i64 %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset @@ -1019,6 +1361,14 @@ ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_f32_natural_addressing_immoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %voffset = load i32, i32 addrspace(1)* %voffset.ptr %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1038,6 +1388,15 @@ ; GCN-NEXT: global_load_dword v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0 %zext.offset = zext i32 %voffset to i64 %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset @@ -1055,6 +1414,15 @@ ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:400 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to 
shader part epilog +; +; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:400 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0 %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset @@ -1089,6 +1457,20 @@ ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX11-NEXT: v_add_co_u32 v0, vcc, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !1 %zext.offset = zext i32 %voffset to i64 %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset @@ -1106,6 +1488,12 @@ ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 
addrspace(1)* @@ -1120,6 +1508,12 @@ ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1135,6 +1529,12 @@ ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to half addrspace(1)* @@ -1148,6 +1548,12 @@ ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_f16_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1162,6 +1568,12 @@ ; GCN-NEXT: global_load_dword v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 
= getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -1176,6 +1588,12 @@ ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i32_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1191,6 +1609,12 @@ ; GCN-NEXT: global_load_dword v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)* @@ -1204,6 +1628,12 @@ ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_f32_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1218,6 +1648,12 @@ ; GCN-NEXT: global_load_dword v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i16> addrspace(1)* @@ -1232,6 +1668,12 @@ ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v2i16_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1247,6 +1689,12 @@ ; GCN-NEXT: global_load_dword v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)* @@ -1260,6 +1708,12 @@ ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v2f16_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1274,6 +1728,12 @@ ; GCN-NEXT: global_load_dword v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; 
return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_p3: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(3)* addrspace(1)* @@ -1289,6 +1749,12 @@ ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_p3_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1305,6 +1771,12 @@ ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to double addrspace(1)* @@ -1319,6 +1791,12 @@ ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_f64_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr 
inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1334,6 +1812,12 @@ ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -1348,6 +1832,12 @@ ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i64_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1363,6 +1853,12 @@ ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)* @@ -1376,6 +1872,12 @@ ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v2f32_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: 
; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1390,6 +1892,12 @@ ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)* @@ -1404,6 +1912,12 @@ ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v2i32_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1419,6 +1933,12 @@ ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)* @@ -1433,6 +1953,12 @@ ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; 
GFX11-LABEL: global_load_saddr_v4i16_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1448,6 +1974,12 @@ ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)* @@ -1462,6 +1994,12 @@ ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v4f16_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1477,6 +2015,12 @@ ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_p1: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 
addrspace(1)* addrspace(1)* @@ -1492,6 +2036,12 @@ ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_p1_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1508,6 +2058,12 @@ ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)* @@ -1521,6 +2077,12 @@ ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v3f32_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1535,6 +2097,12 @@ ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to 
shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)* @@ -1549,6 +2117,12 @@ ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v3i32_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1564,6 +2138,12 @@ ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v6f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x half> addrspace(1)* @@ -1577,6 +2157,12 @@ ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v6f16_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b96 v[0:2], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1591,6 +2177,12 @@ ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part 
epilog +; +; GFX11-LABEL: global_load_saddr_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)* @@ -1604,6 +2196,12 @@ ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v4f32_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1618,6 +2216,12 @@ ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)* @@ -1632,6 +2236,12 @@ ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v4i32_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr 
inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1647,6 +2257,12 @@ ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)* @@ -1661,6 +2277,12 @@ ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v2i64_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1676,6 +2298,12 @@ ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i128 addrspace(1)* @@ -1690,6 +2318,12 @@ ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i128_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1705,6 +2339,12 @@ ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v2p1: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)* @@ -1720,6 +2360,12 @@ ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v2p1_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1736,6 +2382,12 @@ ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v4p3: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)* @@ -1751,6 +2403,12 @@ ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; 
GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_v4p3_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1771,6 +2429,12 @@ ; GCN-NEXT: global_load_sbyte v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_sextload_saddr_i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_i8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %load = load i8, i8 addrspace(1)* %gep0 @@ -1785,6 +2449,12 @@ ; GCN-NEXT: global_load_sbyte v0, v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_sextload_saddr_i8_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_i8 v0, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1800,6 +2470,12 @@ ; GCN-NEXT: global_load_sshort v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_sextload_saddr_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_i16 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ 
-1815,6 +2491,12 @@ ; GCN-NEXT: global_load_sshort v0, v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_sextload_saddr_i16_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_i16 v0, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1831,6 +2513,12 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_zextload_saddr_i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %load = load i8, i8 addrspace(1)* %gep0 @@ -1845,6 +2533,12 @@ ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_zextload_saddr_i8_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1860,6 +2554,12 @@ ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_zextload_saddr_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 
addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -1875,6 +2575,12 @@ ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_zextload_saddr_i16_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1907,6 +2613,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: atomic_global_load_saddr_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -1933,6 +2649,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: atomic_global_load_saddr_i32_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1960,6 
+2686,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: atomic_global_load_saddr_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -1986,6 +2722,16 @@ ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: atomic_global_load_saddr_i64_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:-128 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2005,6 +2751,12 @@ ; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16lo_undef_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_b16 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -2020,6 +2772,12 @@ ; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to 
shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2038,6 +2796,14 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16lo_zero_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -2055,6 +2821,14 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2072,6 +2846,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16lo_reg_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part 
epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -2088,6 +2869,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_b16 v1, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2105,6 +2893,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_u8 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* @@ -2122,6 +2917,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_u8 v1, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2140,6 +2942,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; 
GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_i8 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* @@ -2157,6 +2966,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_i8 v1, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2178,6 +2994,12 @@ ; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16hi_undef_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -2193,6 +3015,12 @@ ; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3] offset:-128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.offset = 
zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2211,6 +3039,14 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -2228,6 +3064,14 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2245,6 +3089,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16hi_reg_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -2261,6 +3112,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: 
v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2278,6 +3136,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* @@ -2295,6 +3160,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_hi_u8 v1, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2313,6 +3185,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; 
GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* @@ -2330,6 +3209,13 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_d16_hi_i8 v1, v0, s[2:3] offset:-128 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2354,6 +3240,14 @@ ; GCN-NEXT: global_load_ubyte v0, v[0:1], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_or_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.idx = zext i32 %idx to i64 %or = or i64 %zext.idx, 16 %addr = inttoptr i64 %or to i8 addrspace(1)* @@ -2371,6 +3265,14 @@ ; GCN-NEXT: global_load_ubyte v0, v[0:1], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_or_b32_e32 v0, 0x1040, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog %zext.idx = zext i32 %idx to i64 %or = or i64 %zext.idx, 4160 %addr = inttoptr i64 %or to i8 addrspace(1)* @@ -2419,6 +3321,24 @@ ; GFX10-NEXT: s_cbranch_scc0 .LBB128_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: 
s_endpgm +; +; GFX11-LABEL: global_addr_64bit_lsr_iv: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: .LBB128_1: ; %bb3 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s4, s2, s0 +; GFX11-NEXT: s_addc_u32 s5, s3, s1 +; GFX11-NEXT: s_add_u32 s0, s0, 4 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400 +; GFX11-NEXT: s_cbranch_scc0 .LBB128_1 +; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -2478,6 +3398,26 @@ ; GFX10-NEXT: s_cbranch_scc0 .LBB129_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_addr_64bit_lsr_iv_multiload: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: .LBB129_1: ; %bb3 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s4, s2, s0 +; GFX11-NEXT: s_addc_u32 s5, s3, s1 +; GFX11-NEXT: s_add_u32 s0, s0, 4 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400 +; GFX11-NEXT: s_cbranch_scc0 .LBB129_1 +; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_endpgm bb: br label %bb3 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 
-mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s ; Test using saddr addressing mode of global_*store_* flat instructions. @@ -11,6 +12,14 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_byte v0, v2, s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_i8_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %voffset = load i32, i32 addrspace(1)* %voffset.ptr %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -26,6 +35,14 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_byte v0, v2, s[2:3] offset:2047 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_i8_zext_vgpr_offset_2047: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] offset:2047 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %voffset = load i32, i32 addrspace(1)* %voffset.ptr %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -42,6 +59,14 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_byte v0, v2, s[2:3] offset:-2048 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_i8_zext_vgpr_offset_neg2048: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] offset:-2048 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %voffset = load i32, i32 addrspace(1)* %voffset.ptr %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, 
i64 %zext.offset @@ -78,6 +103,17 @@ ; GFX10-NEXT: v_readfirstlane_b32 s1, v3 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_uniform_ptr_in_vgprs: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: ds_load_b64 v[2:3], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -107,6 +143,17 @@ ; GFX10-NEXT: v_readfirstlane_b32 s1, v3 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1] offset:-120 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: ds_load_b64 v[2:3], v2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v3 +; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] offset:-120 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -124,6 +171,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_short v0, v1, s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_i16_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -136,6 +189,12 @@ ; GCN: ; 
%bb.0: ; GCN-NEXT: global_store_short v0, v1, s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_i16_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -149,6 +208,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_short v0, v1, s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_f16_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to half addrspace(1)* @@ -161,6 +226,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_short v0, v1, s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_f16_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -174,6 +245,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_i32_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -186,6 +263,12 @@ ; 
GCN: ; %bb.0: ; GCN-NEXT: global_store_dword v0, v1, s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_i32_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -199,6 +282,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_f32_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)* @@ -211,6 +300,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dword v0, v1, s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_f32_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -224,6 +319,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dword v0, v1, s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_p3_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(3)* addrspace(1)* @@ 
-236,6 +337,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dword v0, v1, s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_p3_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -249,6 +356,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_i64_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -261,6 +374,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_i64_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -274,6 +393,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_f64_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 
addrspace(1)* %gep0 to double addrspace(1)* @@ -286,6 +411,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_f64_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -299,6 +430,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v2i32_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)* @@ -311,6 +448,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v2i32_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -324,6 +467,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v2f32_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 
addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)* @@ -336,6 +485,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v2f32_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -349,6 +504,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v4i16_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)* @@ -361,6 +522,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v4i16_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -374,6 +541,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v4f16_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm 
%zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)* @@ -386,6 +559,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v4f16_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -399,6 +578,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_p1_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* addrspace(1)* @@ -411,6 +596,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_p1_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -424,6 +615,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v3i32_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] +; 
GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)* @@ -436,6 +633,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v3i32_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -449,6 +652,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v3f32_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)* @@ -461,6 +670,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v3f32_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -474,6 +689,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v6i16_zext_vgpr: +; 
GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x i16> addrspace(1)* @@ -486,6 +707,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v6i16_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -499,6 +726,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v6f16_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x half> addrspace(1)* @@ -511,6 +744,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v6f16_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -524,6 +763,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GCN-NEXT: 
s_endpgm +; +; GFX11-LABEL: global_store_saddr_v4i32_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)* @@ -536,6 +781,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v4i32_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -549,6 +800,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v4f32_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)* @@ -561,6 +818,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v4f32_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -574,6 +837,12 @@ ; GCN: 
; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v2i64_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)* @@ -586,6 +855,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v2i64_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -599,6 +874,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v2f64_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x double> addrspace(1)* @@ -611,6 +892,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v2f64_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr 
inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -624,6 +911,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v8i16_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <8 x i16> addrspace(1)* @@ -636,6 +929,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v8i16_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -649,6 +948,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v8f16_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <8 x half> addrspace(1)* @@ -661,6 +966,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v8f16_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr 
inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -674,6 +985,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v2p1_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)* @@ -686,6 +1003,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v2p1_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -699,6 +1022,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v4p3_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)* @@ -711,6 +1040,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_v4p3_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -736,6 +1071,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_global_store_saddr_i32_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -756,6 +1099,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v0, v1, s[2:3] offset:-128 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_global_store_saddr_i32_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -777,6 +1128,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_global_store_saddr_i64_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 
addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -797,6 +1156,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: atomic_global_store_saddr_i64_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -814,6 +1181,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_short_d16_hi v0, v1, s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_i16_d16hi_zext_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -827,6 +1200,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_short_d16_hi v0, v1, s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -841,6 +1220,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_byte_d16_hi v0, v1, s[2:3] ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr: +; GFX11: ; %bb.0: 
+; GFX11-NEXT: global_store_d16_hi_b8 v0, v1, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %data.hi = extractelement <2 x i16> %data, i32 1 @@ -854,6 +1239,12 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_store_byte_d16_hi v0, v1, s[2:3] offset:-128 ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128: +; GFX11: ; %bb.0: +; GFX11-NEXT: global_store_d16_hi_b8 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) { ; GFX9-LABEL: udiv32_invariant_denom: @@ -87,6 +88,60 @@ ; GFX10-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: udiv32_invariant_denom: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b64 s[2:3], 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: 
v_cvt_f32_u32_e32 v0, s4 +; GFX11-NEXT: s_sub_i32 s5, 0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v1, s5, v0 +; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB0_1: ; %bb3 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v2, s3, v0 +; GFX11-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_not_b32_e32 v3, v2 +; GFX11-NEXT: v_mul_lo_u32 v4, s5, v2 +; GFX11-NEXT: v_mul_lo_u32 v3, s4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, s2, v3 +; GFX11-NEXT: s_add_u32 s2, s2, 1 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v4, 1, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] 
+; GFX11-NEXT: s_add_u32 s0, s0, 4 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX11-NEXT: s_cbranch_scc0 .LBB0_1 +; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -185,6 +240,57 @@ ; GFX10-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: urem32_invariant_denom: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b64 s[2:3], 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX11-NEXT: s_sub_i32 s5, 0, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v1, s5, v0 +; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB1_1: ; %bb3 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v2, s3, v0 +; GFX11-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_not_b32_e32 v3, v2 +; GFX11-NEXT: v_mul_lo_u32 v2, s5, v2 +; GFX11-NEXT: v_mul_lo_u32 v3, s4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, s2, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, s2, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v2 +; GFX11-NEXT: s_add_u32 s2, s2, 1 +; GFX11-NEXT: s_addc_u32 s3, s3, 0 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, s4, v2 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, 4 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x400 +; GFX11-NEXT: s_cbranch_scc0 .LBB1_1 +; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -289,6 +395,60 @@ ; GFX10-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sdiv32_invariant_denom: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_ashr_i32 s2, s3, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s3, s3, s2 +; GFX11-NEXT: s_xor_b32 s3, s3, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX11-NEXT: s_sub_i32 s4, 0, s3 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX11-NEXT: .p2align 
6 +; GFX11-NEXT: .LBB2_1: ; %bb3 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_hi_u32 v2, s4, v0 +; GFX11-NEXT: v_mul_lo_u32 v3, v2, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, s4, v3 +; GFX11-NEXT: s_add_i32 s4, s4, 1 +; GFX11-NEXT: v_subrev_nc_u32_e32 v5, s3, v3 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v3, v5 :: v_dual_add_nc_u32 v4, 1, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_subrev_nc_u32_e32 v2, s2, v2 +; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, 4 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX11-NEXT: s_cbranch_scc0 .LBB2_1 +; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -385,6 +545,55 @@ ; GFX10-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: srem32_invariant_denom: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_ashr_i32 s3, s2, 31 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s2, s2, s3 +; GFX11-NEXT: 
s_xor_b32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: s_sub_i32 s3, 0, s2 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB3_1: ; %bb3 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_hi_u32 v2, s3, v0 +; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v2, s3, v2 +; GFX11-NEXT: s_add_i32 s3, s3, 1 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, s2, v2 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, s2, v2 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, 4 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400 +; GFX11-NEXT: s_cbranch_scc0 .LBB3_1 +; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -467,6 +676,45 @@ ; GFX10-NEXT: s_cbranch_vccz .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm +; 
+; GFX11-LABEL: udiv16_invariant_denom: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s0, 0xffff, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB4_1: ; %bb3 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX11-NEXT: v_add_nc_u16 v4, v4, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cvt_f32_u32_e32 v7, v0 +; GFX11-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v0, v7, v3 +; GFX11-NEXT: v_add_co_u32 v5, s0, s2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 +; GFX11-NEXT: v_trunc_f32_e32 v0, v0 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f32 v7, -v0, v2, v7 +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v7|, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v0, s0, 0, v0, s0 +; GFX11-NEXT: global_store_b16 v[5:6], v0, off +; GFX11-NEXT: s_cbranch_vccz .LBB4_1 +; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -551,6 +799,48 @@ ; GFX10-NEXT: s_cbranch_vccz .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: urem16_invariant_denom: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: 
s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB5_1: ; %bb3 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX11-NEXT: v_add_nc_u16 v4, v4, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v7, v0 +; GFX11-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v8, v7, v3 +; GFX11-NEXT: v_add_co_u32 v5, s0, s2, v5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_trunc_f32_e32 v8, v8 +; GFX11-NEXT: v_fma_f32 v7, -v8, v2, v7 +; GFX11-NEXT: v_cvt_u32_f32_e32 v8, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v7, v7, s1 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v7 +; GFX11-NEXT: global_store_b16 v[5:6], v0, off +; GFX11-NEXT: s_cbranch_vccz .LBB5_1 +; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -643,6 +933,54 @@ ; GFX10-NEXT: s_cbranch_vccz .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: 
sdiv16_invariant_denom: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sext_i32_i16 s4, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB6_1: ; %bb3 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_bfe_i32 v5, v4, 0, 16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX11-NEXT: v_add_nc_u16 v4, v4, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f32_i32_e32 v7, v5 +; GFX11-NEXT: v_xor_b32_e32 v8, s4, v5 +; GFX11-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 +; GFX11-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_mul_f32_e32 v0, v7, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_ashrrev_i32_e32 v8, 30, v8 +; GFX11-NEXT: v_add_co_u32 v5, s0, s2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_trunc_f32_e32 v0, v0 +; GFX11-NEXT: v_or_b32_e32 v8, 1, v8 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f32 v7, -v0, v2, v7 +; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-NEXT: v_cmp_ge_f32_e64 s1, |v7|, |v2| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, v8, s1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, 
v7 +; GFX11-NEXT: global_store_b16 v[5:6], v0, off +; GFX11-NEXT: s_cbranch_vccz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: br label %bb3 @@ -737,6 +1075,55 @@ ; GFX10-NEXT: s_cbranch_vccz .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: srem16_invariant_denom: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sext_i32_i16 s1, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB7_1: ; %bb3 +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_bfe_i32 v7, v4, 0, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_i32_e32 v5, v7 +; GFX11-NEXT: v_xor_b32_e32 v6, s1, v7 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f32_e32 v8, v5, v3 +; GFX11-NEXT: v_ashrrev_i32_e32 v6, 30, v6 +; GFX11-NEXT: v_trunc_f32_e32 v8, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v6, 1, v6 +; GFX11-NEXT: v_fma_f32 v5, -v8, v2, v5 +; GFX11-NEXT: v_cvt_i32_f32_e32 v8, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v5|, |v2| +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX11-NEXT: v_add_nc_u16 v4, v4, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc_lo +; GFX11-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; 
GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v0, v8, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v5, s0, s2, v5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_lo_u32 v0, v0, s1 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, v7, v0 +; GFX11-NEXT: global_store_b16 v[5:6], v0, off +; GFX11-NEXT: s_cbranch_vccz .LBB7_1 +; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm bb: br label %bb3 diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX11 %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI %s ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s @@ -17,6 +18,18 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: store_inline_imm_neg_0.0_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] +; 
GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc ; encoding: [0x00,0x20,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: store_inline_imm_neg_0.0_i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] @@ -53,6 +66,17 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: store_inline_imm_0.0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: store_inline_imm_0.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] @@ -87,6 +111,17 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: store_imm_neg_0.0_f16: +; GFX11: ; 
%bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: store_imm_neg_0.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] @@ -121,6 +156,17 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: store_inline_imm_0.5_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: store_inline_imm_0.5_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] @@ -155,6 +201,17 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: 
[0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: store_inline_imm_m_0.5_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: store_inline_imm_m_0.5_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] @@ -189,6 +246,17 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: store_inline_imm_1.0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: store_inline_imm_1.0_f16: ; VI: ; %bb.0: ; VI-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] @@ -223,6 +291,17 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: store_inline_imm_m_1.0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: store_inline_imm_m_1.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] @@ -257,6 +336,17 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: store_inline_imm_2.0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: store_inline_imm_2.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] @@ -291,6 +381,17 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: store_inline_imm_m_2.0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: store_inline_imm_m_2.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] @@ -325,6 +426,17 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: store_inline_imm_4.0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; 
encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: store_inline_imm_4.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] @@ -359,6 +471,17 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: store_inline_imm_m_4.0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: store_inline_imm_m_4.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] @@ -393,6 +516,17 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: store_inline_imm_inv_2pi_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00] +; GFX11-NEXT: s_mov_b32 s3, 
0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: store_inline_imm_inv_2pi_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] @@ -427,6 +561,17 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: store_inline_imm_m_inv_2pi_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: store_inline_imm_m_inv_2pi_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] @@ -461,6 +606,17 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: store_literal_imm_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; 
encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: store_literal_imm_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00] @@ -497,6 +653,19 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_0.0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_0.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: 
[0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] @@ -538,6 +707,19 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_0.5_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_0.5_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] @@ -579,6 +761,19 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_neg_0.5_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -0.5 ; encoding: 
[0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_neg_0.5_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] @@ -620,6 +815,19 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_1.0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_1.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] @@ -661,6 +869,19 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_neg_1.0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: 
s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_neg_1.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] @@ -702,6 +923,19 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_2.0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; 
GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_2.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] @@ -743,6 +977,19 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_neg_2.0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_neg_2.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] @@ -784,6 +1031,19 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_4.0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: 
[0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_4.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] @@ -825,6 +1085,19 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_neg_4.0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_neg_4.0_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] @@ -872,6 +1145,25 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: 
[0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: commute_add_inline_imm_0.5_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] +; GFX11-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe] +; GFX11-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe] +; GFX11-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe] +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80] +; GFX11-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e32 v0, 0.5, v0 ; encoding: [0xf0,0x00,0x00,0x64] +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: commute_add_inline_imm_0.5_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] @@ -934,6 +1226,25 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: commute_add_literal_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] +; GFX11-NEXT: s_mov_b32 s7, 
0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] +; GFX11-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe] +; GFX11-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe] +; GFX11-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe] +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80] +; GFX11-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e32 v0, 0x6400, v0 ; encoding: [0xff,0x00,0x00,0x64,0x00,0x64,0x00,0x00] +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: commute_add_literal_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] @@ -990,6 +1301,19 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_1_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 
; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_1_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] @@ -1031,6 +1355,19 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_2_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_2_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] @@ -1072,6 +1409,19 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_16_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 
; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_16_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] @@ -1119,6 +1469,25 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_neg_1_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] +; GFX11-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe] +; GFX11-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe] +; GFX11-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe] +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80] +; GFX11-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; 
encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: v_add_nc_u16 v0, v0, -1 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x83,0x01,0x00] +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_neg_1_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] @@ -1180,6 +1549,25 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_neg_2_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] +; GFX11-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe] +; GFX11-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe] +; GFX11-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe] +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80] +; GFX11-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: v_add_nc_u16 v0, v0, -2 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x85,0x01,0x00] +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; 
encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_neg_2_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] @@ -1241,6 +1629,25 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_neg_16_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe] +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe] +; GFX11-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe] +; GFX11-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe] +; GFX11-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe] +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80] +; GFX11-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe] +; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] +; GFX11-NEXT: v_add_nc_u16 v0, v0, -16 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0xa1,0x01,0x00] +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_neg_16_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00] @@ -1296,6 +1703,19 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; 
encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_63_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_63_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] @@ -1337,6 +1757,19 @@ ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] ; +; GFX11-LABEL: add_inline_imm_64_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf] +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8] +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] +; GFX11-NEXT: v_add_f16_e64 v0, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00] +; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: 
[0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] +; ; VI-LABEL: add_inline_imm_64_f16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00] @@ -1377,6 +1810,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; +; GFX11-LABEL: mul_inline_imm_0.5_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_mul_lo_u16 v2, 0x3800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x38,0x00,0x00] +; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; ; VI-LABEL: mul_inline_imm_0.5_i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] @@ -1412,6 +1854,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; +; GFX11-LABEL: mul_inline_imm_neg_0.5_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_mul_lo_u16 v2, 0xb800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xb8,0xff,0xff] +; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; ; 
VI-LABEL: mul_inline_imm_neg_0.5_i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] @@ -1447,6 +1898,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; +; GFX11-LABEL: mul_inline_imm_1.0_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_mul_lo_u16 v2, 0x3c00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x3c,0x00,0x00] +; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; ; VI-LABEL: mul_inline_imm_1.0_i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] @@ -1482,6 +1942,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; +; GFX11-LABEL: mul_inline_imm_neg_1.0_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_mul_lo_u16 v2, 0xbc00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xbc,0xff,0xff] +; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; ; VI-LABEL: mul_inline_imm_neg_1.0_i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] @@ -1517,6 +1986,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; 
encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; +; GFX11-LABEL: shl_inline_imm_2.0_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_lshlrev_b16 v2, v2, 0x4000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0x40,0x00,0x00] +; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; ; VI-LABEL: shl_inline_imm_2.0_i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] @@ -1552,6 +2030,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; +; GFX11-LABEL: shl_inline_imm_neg_2.0_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_lshlrev_b16 v2, v2, 0xc000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0xc0,0xff,0xff] +; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; ; VI-LABEL: shl_inline_imm_neg_2.0_i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] @@ -1587,6 +2074,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; +; GFX11-LABEL: mul_inline_imm_4.0_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_mul_lo_u16 v2, 0x4400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x44,0x00,0x00] +; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; ; VI-LABEL: mul_inline_imm_4.0_i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] @@ -1622,6 +2118,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; +; GFX11-LABEL: mul_inline_imm_neg_4.0_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_mul_lo_u16 v2, 0xc400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xc4,0xff,0xff] +; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; ; VI-LABEL: mul_inline_imm_neg_4.0_i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] @@ -1657,6 +2162,15 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; +; GFX11-LABEL: mul_inline_imm_inv2pi_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: v_mul_lo_u16 v2, 0x3118, v2 ; encoding: 
[0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x18,0x31,0x00,0x00] +; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00] +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] +; ; VI-LABEL: mul_inline_imm_inv2pi_i16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] diff --git a/llvm/test/CodeGen/AMDGPU/insert-branch-w32.mir b/llvm/test/CodeGen/AMDGPU/insert-branch-w32.mir --- a/llvm/test/CodeGen/AMDGPU/insert-branch-w32.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-branch-w32.mir @@ -1,4 +1,5 @@ # RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass branch-folder -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass branch-folder -o - %s | FileCheck %s # Designed to provoke calling SIInstrInfo::insertBranch in wave32 mode # The implicit $vcc operand should be $vcc_lo in this case diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -2,6 +2,7 @@ ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CIVI,VI %s ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CIVI,CI %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_0: @@ -29,6 +30,19 @@ ; CIVI-NEXT: v_mov_b32_e32 v2, s0 ; CIVI-NEXT: flat_store_dword v[0:1], v2 ; 
CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: s_insertelement_v2i16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_pack_lh_b32_b16 s2, 0x3e7, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 0 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -81,6 +95,21 @@ ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: s_insertelement_v2i16_0_reg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_pack_lh_b32_b16 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -144,6 +173,25 @@ ; CI-NEXT: ; use s2 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s1 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt1 = extractelement <2 x i16> %vec, i32 1 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 @@ -197,6 +245,21 @@ ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: s_insertelement_v2i16_0_reghi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_pack_hh_b32_b16 s0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 @@ -260,6 +323,25 @@ ; CI-NEXT: ; use s0 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_pack_lh_b32_b16 s1, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: ;;#ASMSTART +; 
GFX11-NEXT: ; use s0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 @@ -335,6 +417,29 @@ ; CI-NEXT: ; use s1 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s1 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 @@ -375,6 +480,19 @@ ; CIVI-NEXT: v_mov_b32_e32 v2, s0 ; CIVI-NEXT: flat_store_dword v[0:1], v2 ; CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: s_insertelement_v2i16_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, 0x3e7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = 
insertelement <2 x i16> %vec, i16 999, i32 1 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -426,6 +544,21 @@ ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: s_insertelement_v2i16_1_reg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -459,6 +592,20 @@ ; CIVI-NEXT: v_mov_b32_e32 v2, s0 ; CIVI-NEXT: flat_store_dword v[0:1], v2 ; CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: s_insertelement_v2f16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_pack_ll_b32_b16 s2, 0x4500, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0 store <2 x half> %vecins, <2 x half> addrspace(1)* %out @@ -491,6 +638,19 @@ ; CIVI-NEXT: v_mov_b32_e32 v2, s0 ; CIVI-NEXT: flat_store_dword v[0:1], v2 ; CIVI-NEXT: s_endpgm +; +; GFX11-LABEL: s_insertelement_v2f16_1: +; 
GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, 0x4500 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 store <2 x half> %vecins, <2 x half> addrspace(1)* %out @@ -546,6 +706,19 @@ ; CI-NEXT: v_or_b32_e32 v2, 0x3e7, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v2i16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_movk_i32 s2, 0x3e7 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -609,6 +782,21 @@ ; CI-NEXT: v_or_b32_e32 v2, s0, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v2i16_0_reghi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: v_lshrrev_b32_e64 v2, 16, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v1, 0xffff0000, v1, v2 +; GFX11-NEXT: 
global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -669,6 +857,18 @@ ; CI-NEXT: v_or_b32_e32 v2, 53, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v2i16_0_inlineimm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -729,6 +929,20 @@ ; CI-NEXT: v_or_b32_e32 v2, 0x3e70000, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v2i16_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, 0x3e7, 16, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -787,6 +1001,20 @@ ; CI-NEXT: v_or_b32_e32 v2, 0xfff10000, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v2i16_1_inlineimm: +; 
GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, -15, 16, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -846,6 +1074,20 @@ ; CI-NEXT: v_or_b32_e32 v2, 0x4500, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v2f16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, 0x4500 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext @@ -904,6 +1146,20 @@ ; CI-NEXT: v_or_b32_e32 v2, 53, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v2f16_0_inlineimm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, 53 +; 
GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext @@ -963,6 +1219,20 @@ ; CI-NEXT: v_or_b32_e32 v2, 0x45000000, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v2f16_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, 0x4500, 16, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext @@ -1021,6 +1291,20 @@ ; CI-NEXT: v_or_b32_e32 v2, 0x230000, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v2f16_1_inlineimm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, 35, 16, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext @@ -1088,6 +1372,27 @@ ; CI-NEXT: v_mov_b32_e32 v2, s0 ; 
CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: s_insertelement_v2i16_dynamic: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s3, s4, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s3, 0xffff, s3 +; GFX11-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s3, s3, 0x3e703e7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %idx = load volatile i32, i32 addrspace(4)* %idx.ptr %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx @@ -1152,6 +1457,22 @@ ; CI-NEXT: v_bfi_b32 v2, s0, v2, v3 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v2i16_dynamic_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: s_lshl_b32 s0, s0, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, s0, 0x3e703e7, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x 
i16> addrspace(1)* %in, i64 %tid.ext @@ -1233,6 +1554,26 @@ ; CI-NEXT: v_bfi_b32 v2, v2, s0, v3 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v2f16_dynamic_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, 0xffff +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, v1, 0x12341234, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext @@ -1298,6 +1639,19 @@ ; CI-NEXT: v_bfi_b32 v0, s0, v4, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v4f16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s0, v0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext @@ -1363,6 +1717,21 @@ ; CI-NEXT: v_or_b32_e32 v0, s0, v0 ; CI-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v4f16_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext @@ -1428,6 +1797,19 @@ ; CI-NEXT: v_bfi_b32 v1, s0, v4, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v4f16_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x30 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s0, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext @@ -1493,6 +1875,21 @@ ; CI-NEXT: v_or_b32_e32 v1, s0, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v4f16_3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] +; GFX11-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, s0, 16, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext @@ -1558,6 +1955,19 @@ ; CI-NEXT: v_bfi_b32 v1, s0, v4, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v4i16_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s0, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1644,6 +2054,29 @@ ; CI-NEXT: v_bfi_b32 v0, v4, s0, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX11-NEXT: global_load_b32 v2, v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0xffff +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1] +; GFX11-NEXT: 
s_pack_ll_b32_b16 s0, s2, s2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, v3, s0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfi_b32 v0, v2, s0, v0 +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1730,6 +2163,24 @@ ; CI-NEXT: v_bfi_b32 v0, s0, v5, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v4f16_dynamic_sgpr: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_mov_b64 s[2:3], 0xffff +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] +; GFX11-NEXT: s_lshl_b32 s1, s1, 4 +; GFX11-NEXT: s_pack_ll_b32_b16 s6, s0, s0 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, s1, s6, v1 +; GFX11-NEXT: v_bfi_b32 v0, s0, s6, v0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext @@ -1797,6 +2248,21 @@ ; CI-NEXT: v_or_b32_e32 v1, s0, v1 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v8f16_3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, s0, 16, v1 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext @@ -1863,6 +2329,19 @@ ; CI-NEXT: v_bfi_b32 v3, s0, v6, v3 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v8i16_6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s0, v3 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x i16>, <8 x i16> addrspace(1)* %in, i64 %tid.ext @@ -2047,6 +2526,55 @@ ; CI-NEXT: v_or_b32_e32 v0, v0, v6 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm +; +; GFX11-LABEL: v_insertelement_v8f16_dynamic: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7] +; GFX11-NEXT: s_cmp_eq_u32 s1, 7 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 6 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 5 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s3 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 4 +; GFX11-NEXT: 
v_cndmask_b32_e64 v5, v5, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 3 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s0, s3 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-NEXT: s_cselect_b32 s6, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s0, s3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s0, s6 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v3, v5, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v2 +; GFX11-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v0, v8, 16, v0 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x half>, <8 x half> addrspace(1)* %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck %s declare i32 
@llvm.amdgcn.ballot.i32(i1) declare i32 @llvm.ctpop.i32(i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10 +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11 define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { ; SI-LABEL: s_cvt_pkrtz_v2f16_f32: @@ -50,6 +51,18 @@ ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) store <2 x half> %result, <2 x half> addrspace(1)* %out ret void @@ -98,6 +111,18 @@ ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s4, s4 ; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_cvt_pkrtz_samereg_v2f16_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) store <2 x half> %result, <2 x half> addrspace(1)* %out ret void @@ -111,6 +136,10 @@ ; GFX10-LABEL: s_cvt_pkrtz_undef_undef: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_cvt_pkrtz_undef_undef: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef) store <2 x half> %result, <2 x half> addrspace(1)* %out ret void @@ -188,6 +217,22 @@ ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -255,6 +300,18 @@ ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, 1.0 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt 
vmcnt(0) +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, 1.0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -320,6 +377,18 @@ ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, 1.0, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -402,6 +471,22 @@ ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* 
%a.ptr, i64 %tid.ext @@ -487,6 +572,22 @@ ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, -v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -572,6 +673,22 @@ ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, -v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -658,6 +775,22 @@ ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -|v1|, -v2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; 
GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -|v1|, -v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s ; GCN-LABEL: {{^}}gws_sema_p_offset0: ; NOLOOP-DAG: s_mov_b32 m0, 0{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll @@ -9,6 +9,8 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOLOOP %s ; GFX6ERR-SDAG: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.ds.gws.sema.release.all ; GFX6ERR-GISEL: LLVM ERROR: cannot select: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.gws.sema.release.all), %{{[0-9]+}}:sgpr(s32) :: (store (s32) into custom "GWSResource") (in function: gws_sema_release_all_offset0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,LOOP %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s ; GCN-LABEL: {{^}}gws_sema_v_offset0: ; NOLOOP-DAG: 
s_mov_b32 m0, 0{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll @@ -4,9 +4,11 @@ ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MADMACF32,GFX101 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOMADMACF32,GFX103 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s ; GCN-LABEL: {{^}}test_mul_legacy_f32: ; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}} +; GFX11: v_mul_dx9_zero_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 { %result = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) store float %result, float addrspace(1)* %out, align 4 @@ -15,6 +17,7 @@ ; GCN-LABEL: {{^}}test_mul_legacy_undef0_f32: ; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}} +; GFX11: v_mul_dx9_zero_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 { %result = call float @llvm.amdgcn.fmul.legacy(float undef, float %a) store float %result, float addrspace(1)* %out, align 4 @@ -23,6 +26,7 @@ ; GCN-LABEL: {{^}}test_mul_legacy_undef1_f32: ; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}} +; GFX11: v_mul_dx9_zero_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 { %result = call float @llvm.amdgcn.fmul.legacy(float %a, float undef) store float %result, float addrspace(1)* %out, align 
4 @@ -31,6 +35,7 @@ ; GCN-LABEL: {{^}}test_mul_legacy_fabs_f32: ; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, |s{{[0-9]+}}|, |{{[sv][0-9]+}}| +; GFX11: v_mul_dx9_zero_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|, |s{{[0-9]+}}| define amdgpu_kernel void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) %b.fabs = call float @llvm.fabs.f32(float %b) @@ -43,6 +48,8 @@ ; GCN-LABEL: {{^}}test_add_mul_legacy_f32: ; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}} ; GCN: v_add_f32_e{{(32|64)}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}} +; GFX11: v_mul_dx9_zero_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GFX11: v_dual_mov_b32 v{{[0-9]+}}, 0 :: v_dual_add_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @test_add_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 { %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) %add = fadd float %mul, %c @@ -57,6 +64,8 @@ ; GFX101: v_mac_legacy_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GFX103: v_mul_legacy_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GFX103: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11: v_mul_dx9_zero_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GFX11: v_dual_mov_b32 v{{[0-9]+}}, 0 :: v_dual_add_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @test_mad_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #2 { %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) %add = fadd float %mul, %c @@ -82,6 +91,8 @@ ; MADMACF32: v_mad_legacy_f32 v{{[0-9]+}}, -s{{[0-9]+}}, -{{[sv][0-9]+}}, v{{[0-9]+}} ; NOMADMACF32: v_mul_legacy_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, -s{{[0-9]+}} ; NOMADMACF32: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} +; GFX11: v_mul_dx9_zero_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, -s{{[0-9]+}} +; GFX11: v_dual_mov_b32 v{{[0-9]+}}, 0 :: v_dual_add_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} 
define amdgpu_kernel void @test_mad_legacy_fneg_f32(float addrspace(1)* %out, float %a, float %b, float %c) #2 { %a.fneg = fneg float %a %b.fneg = fneg float %b diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.direct.load.ll @@ -8,9 +8,12 @@ ; GFX11: lds_direct_load v{{[0-9]+}} ; GFX11: s_mov_b32 m0 ; GFX11: lds_direct_load v{{[0-9]+}} +; GFX11: s_waitcnt expcnt(2) ; GFX11: v_add_f32 ; GFX11: buffer_store_b32 +; GFX11: s_waitcnt expcnt(1) ; GFX11: buffer_store_b32 +; GFX11: s_waitcnt expcnt(0) ; GFX11: buffer_store_b32 ; GFX11: buffer_store_b32 ; GFX11: buffer_store_b32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.param.load.ll @@ -8,11 +8,16 @@ ; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr0.z ; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr0.w ; GFX11-DAG: lds_param_load v{{[0-9]+}}, attr1.x +; GFX11: s_waitcnt expcnt(4) ; GFX11: v_add_f32 ; GFX11: buffer_store_b32 +; GFX11: s_waitcnt expcnt(3) ; GFX11: buffer_store_b32 +; GFX11: s_waitcnt expcnt(2) ; GFX11: buffer_store_b32 +; GFX11: s_waitcnt expcnt(1) ; GFX11: buffer_store_b32 +; GFX11: s_waitcnt expcnt(0) ; GFX11: buffer_store_b32 ; GFX11: buffer_store_b32 define amdgpu_ps void @lds_param_load(<4 x i32> inreg %buf, i32 inreg %arg) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll @@ -1,6 +1,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PREGFX10 %s ;RUN: llc < %s 
-march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=PREGFX10 %s ;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=GFX10 %s +;RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=GFX11 %s ; GCN-LABEL: {{^}}tbuffer_load: ; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] @@ -11,6 +12,10 @@ ; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_SINT] glc ; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] slc ; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] glc dlc +; GFX11-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 +; GFX11-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_32_32_FLOAT] glc +; GFX11-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] slc +; GFX11-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] glc dlc ; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { main_body: @@ -31,6 +36,7 @@ ; GCN-LABEL: {{^}}tbuffer_load_immoffs: ; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42 ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 offset:42 +; GFX11: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 offset:42 define amdgpu_vs <4 x 
float> @tbuffer_load_immoffs(<4 x i32> inreg) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 42, i32 0, i32 78, i32 0) @@ -45,6 +51,9 @@ ; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_FMT_10_10_10_2_SSCALED] offset:4095 ; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_UINT] offset:73 ; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_32_32_FLOAT] offset:1 +; GFX11-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_FMT_8_8_8_8_SINT] offset:4095 +; GFX11-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_32_32_SINT] offset:73 +; GFX11-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:77 offset:1 ; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 4095, i32 61, i32 47, i32 0) @@ -62,6 +71,7 @@ ; GCN-LABEL: {{^}}tbuffer_load_ofs: ; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 offen +; GFX11: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 offen define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 %voffs, i32 0, i32 78, i32 0) @@ -83,6 +93,7 @@ ; GCN-LABEL: {{^}}buffer_load_xy: ; PREGFX10: tbuffer_load_format_xy 
{{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] ; GFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_32_32_FLOAT] +; GFX11: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:77 define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { %vdata = call <2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast <2 x i32> %vdata to <2 x float> @@ -92,6 +103,7 @@ ; GCN-LABEL: {{^}}buffer_load_x: ; PREGFX10: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] ; GFX10: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_32_32_FLOAT] +; GFX11: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:77 define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { %vdata = call i32 @llvm.amdgcn.raw.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast i32 %vdata to float diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll @@ -1,6 +1,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GCN,PREGFX10 %s ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,PREGFX10 %s ;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s +;RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX11 %s ; GCN-LABEL: {{^}}tbuffer_store: ; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16_16_16,BUF_NUM_FORMAT_USCALED] @@ 
-11,6 +12,10 @@ ; GFX10: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] glc ; GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 slc ; GFX10: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 glc dlc +; GFX11: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_FMT_8_8_8_8_USCALED] +; GFX11: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_UINT] glc +; GFX11: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 slc +; GFX11: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 glc dlc define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -26,6 +31,7 @@ ; GCN-LABEL: {{^}}tbuffer_store_immoffs: ; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 ; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:117 offset:42 +; GFX11: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:117 offset:42 define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -36,6 +42,7 @@ ; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs: ; PREGFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], {{s[0-9]+}} format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42 ; GFX10: tbuffer_store_format_xyzw v[0:3], off, s[0:3], {{s[0-9]+}} format:117 offset:42 +; GFX11: tbuffer_store_format_xyzw v[0:3], off, s[0:3], {{s[0-9]+}} format:117 offset:42 define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -46,6 +53,7 @@ ; GCN-LABEL: {{^}}buffer_store_ofs: ; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_8_8,BUF_NUM_FORMAT_FLOAT] offen ; GFX10: tbuffer_store_format_xyzw v[0:3], v4, 
s[0:3], 0 format:115 offen +; GFX11: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:115 offen define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -56,6 +64,7 @@ ; GCN-LABEL: {{^}}buffer_store_x1: ; PREGFX10: tbuffer_store_format_x v0, off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT] ; GFX10: tbuffer_store_format_x v0, off, s[0:3], 0 format:125 +; GFX11: tbuffer_store_format_x v0, off, s[0:3], 0 format:125 define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data) { main_body: %data.i = bitcast float %data to i32 @@ -66,6 +75,7 @@ ; GCN-LABEL: {{^}}buffer_store_x2: ; PREGFX10: tbuffer_store_format_xy v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] ; GFX10: tbuffer_store_format_xy v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] +; GFX11: tbuffer_store_format_xy v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data) { main_body: %data.i = bitcast <2 x float> %data to <2 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll @@ -1,4 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: not --crash llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERR %s + +; ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.s.memrealtime declare i64 @llvm.amdgcn.s.memrealtime() #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setreg.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga 
-verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX789 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s ; FIXME: This copy of the test is a subset of the -global-isel version, since the VGPR case doesn't work. @@ -33,6 +34,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_f32_round_mode_rtz: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 ; encoding: [0x01,0x08,0x80,0xb9,0x03,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 2049, i32 3) call void asm sideeffect "", ""() ret void @@ -60,6 +68,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_f64_round_mode_rtz: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 ; encoding: [0x81,0x08,0x80,0xb9,0x03,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 2177, i32 3) call void asm sideeffect "", ""() ret void @@ -87,6 +102,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_all_round_mode_rtz: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 7 ; encoding: [0x81,0x18,0x80,0xb9,0x07,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: 
[0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6273, i32 7) call void asm sideeffect "", ""() ret void @@ -114,6 +136,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_roundingmode_var: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 2), s0 ; encoding: [0x01,0x08,0x00,0xb9] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 2049, i32 %var.mode) call void asm sideeffect "", ""() ret void @@ -140,6 +169,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_ieee_mode_off: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 0 ; encoding: [0x41,0x02,0x80,0xb9,0x00,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 577, i32 0) call void asm sideeffect "", ""() ret void @@ -166,6 +202,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_ieee_mode_on: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 9, 1), 1 ; encoding: [0x41,0x02,0x80,0xb9,0x01,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 577, i32 1) call void asm sideeffect "", ""() ret void @@ -192,6 +235,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_dx10_clamp_off: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 0 ; encoding: [0x01,0x02,0x80,0xb9,0x00,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; 
GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 513, i32 0) call void asm sideeffect "", ""() ret void @@ -218,6 +268,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_dx10_clamp_on: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 8, 1), 1 ; encoding: [0x01,0x02,0x80,0xb9,0x01,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 513, i32 1) call void asm sideeffect "", ""() ret void @@ -245,6 +302,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0 ; encoding: [0x01,0x38,0x00,0xb9] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 inreg %mode) call void asm sideeffect "", ""() ret void @@ -272,6 +336,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_most_both_round_mode_and_denorm_mode: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 7), 6 ; encoding: [0x01,0x30,0x80,0xb9,0x06,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 12289, i32 6) call void asm sideeffect "", ""() ret void @@ -299,6 +370,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_most_both_round_mode_and_denorm_mode_6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 1, 3), 6 ; 
encoding: [0x41,0x10,0x80,0xb9,0x06,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 4161, i32 6) call void asm sideeffect "", ""() ret void @@ -325,6 +403,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_f32_denorm_mode: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0 ; encoding: [0x01,0x09,0x00,0xb9] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 2305, i32 %val) call void asm sideeffect "", ""() ret void @@ -351,6 +436,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_f64_denorm_mode: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 6, 2), s0 ; encoding: [0x81,0x09,0x00,0xb9] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 2433, i32 %val) call void asm sideeffect "", ""() ret void @@ -377,6 +469,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0 ; encoding: [0x01,0x18,0x00,0xb9] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 %val) call void asm sideeffect "", ""() ret void @@ -403,6 +502,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_round_mode_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0x91,0xbf] +; 
GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 0) call void asm sideeffect "", ""() ret void @@ -429,6 +535,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_round_mode_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x1 ; encoding: [0x01,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 1) call void asm sideeffect "", ""() ret void @@ -455,6 +568,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_round_mode_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x2 ; encoding: [0x02,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 2) call void asm sideeffect "", ""() ret void @@ -481,6 +601,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_round_mode_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x4 ; encoding: [0x04,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 4) call void asm sideeffect "", ""() ret void @@ -507,6 +634,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_round_mode_8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x8 ; encoding: [0x08,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void 
@llvm.amdgcn.s.setreg(i32 6145, i32 8) call void asm sideeffect "", ""() ret void @@ -533,6 +667,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_round_mode_15: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0xf ; encoding: [0x0f,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 15) call void asm sideeffect "", ""() ret void @@ -560,6 +701,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_round_mode_42: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0xa ; encoding: [0x0a,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6145, i32 42) call void asm sideeffect "", ""() ret void @@ -586,6 +734,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0x92,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 0) call void asm sideeffect "", ""() ret void @@ -612,6 +767,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_denorm_mode 1 ; encoding: [0x01,0x00,0x92,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 1) call void asm sideeffect "", ""() ret void @@ -639,6 +801,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; 
GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_denorm_mode 2 ; encoding: [0x02,0x00,0x92,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 2) call void asm sideeffect "", ""() ret void @@ -665,6 +834,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_denorm_mode 4 ; encoding: [0x04,0x00,0x92,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 4) call void asm sideeffect "", ""() ret void @@ -691,6 +867,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode_8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_denorm_mode 8 ; encoding: [0x08,0x00,0x92,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 8) call void asm sideeffect "", ""() ret void @@ -717,6 +900,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode_15: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_denorm_mode 15 ; encoding: [0x0f,0x00,0x92,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 15) call void asm sideeffect "", ""() ret void @@ -743,6 +933,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_denorm_mode_42: +; 
GFX11: ; %bb.0: +; GFX11-NEXT: s_denorm_mode 10 ; encoding: [0x0a,0x00,0x92,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6401, i32 42) call void asm sideeffect "", ""() ret void @@ -771,6 +968,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 0) call void asm sideeffect "", ""() ret void @@ -798,6 +1003,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x1 ; encoding: [0x01,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 1) call void asm sideeffect "", ""() ret void @@ -825,6 +1038,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x2 ; encoding: [0x02,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 
14337, i32 2) call void asm sideeffect "", ""() ret void @@ -852,6 +1073,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x4 ; encoding: [0x04,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 4) call void asm sideeffect "", ""() ret void @@ -879,6 +1108,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x8 ; encoding: [0x08,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 8) call void asm sideeffect "", ""() ret void @@ -906,6 +1143,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 1 ; encoding: [0x01,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 1 ; encoding: [0x01,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 16) call void asm sideeffect "", ""() ret void @@ -933,6 +1178,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 2 ; encoding: [0x02,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] 
+; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 2 ; encoding: [0x02,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 32) call void asm sideeffect "", ""() ret void @@ -960,6 +1213,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 4 ; encoding: [0x04,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 4 ; encoding: [0x04,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 64) call void asm sideeffect "", ""() ret void @@ -987,6 +1248,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 8 ; encoding: [0x08,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x0 ; encoding: [0x00,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 8 ; encoding: [0x08,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 128) call void asm sideeffect "", ""() ret void @@ -1014,6 +1283,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_15: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0xf ; encoding: [0x0f,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; 
GFX11-NEXT: s_denorm_mode 0 ; encoding: [0x00,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 15) call void asm sideeffect "", ""() ret void @@ -1041,6 +1318,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 15 ; encoding: [0x0f,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_255: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0xf ; encoding: [0x0f,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 15 ; encoding: [0x0f,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 255) call void asm sideeffect "", ""() ret void @@ -1069,6 +1354,14 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_denorm_mode 5 ; encoding: [0x05,0x00,0xa5,0xbf] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_full_both_round_mode_and_denorm_mode_597: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_round_mode 0x5 ; encoding: [0x05,0x00,0x91,0xbf] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_denorm_mode 5 ; encoding: [0x05,0x00,0x92,0xbf] +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14337, i32 597) call void asm sideeffect "", ""() ret void @@ -1095,6 +1388,13 @@ ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_set_8_bits_straddles_round_and_denorm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 8), 0xff ; encoding: [0x81,0x38,0x80,0xb9,0xff,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 14465, i32 255) call void asm sideeffect "", ""() ret void @@ -1121,6 +1421,13 @@ ; GFX10-NEXT: ;;#ASMSTART 
; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] +; +; GFX11-LABEL: test_setreg_set_4_bits_straddles_round_and_denorm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 4), 15 ; encoding: [0x81,0x18,0x80,0xb9,0x0f,0x00,0x00,0x00] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] call void @llvm.amdgcn.s.setreg(i32 6273, i32 15) call void asm sideeffect "", ""() ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll @@ -1,6 +1,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GCN,PREGFX10 %s ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,PREGFX10 %s ;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s +;RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX11 %s ; GCN-LABEL: {{^}}tbuffer_load: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 @@ -12,6 +13,10 @@ ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_SINT] idxen glc ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] idxen slc ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] idxen glc dlc +; GFX11: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen +; GFX11: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen glc +; GFX11: tbuffer_load_format_xyzw 
{{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] idxen slc +; GFX11: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] idxen glc dlc ; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { main_body: @@ -33,6 +38,7 @@ ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 ; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:42 ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offset:42 +; GFX11: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offset:42 define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 78, i32 0) @@ -48,6 +54,9 @@ ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_FMT_10_10_10_2_SSCALED] idxen offset:4095 ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_UINT] idxen offset:73 ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:1 +; GFX11: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_FMT_8_8_8_8_SINT] idxen offset:4095 +; GFX11: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_32_32_SINT] idxen offset:73 +; GFX11: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:77 idxen offset:1 ; GCN: s_waitcnt define amdgpu_vs {<4 
x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 4095, i32 61, i32 47, i32 0) @@ -65,6 +74,7 @@ ; GCN-LABEL: {{^}}tbuffer_load_idx: ; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen +; GFX11: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 78, i32 0) @@ -75,6 +85,7 @@ ; GCN-LABEL: {{^}}tbuffer_load_ofs: ; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offen +; GFX11: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offen define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 78, i32 0) @@ -85,6 +96,7 @@ ; GCN-LABEL: {{^}}tbuffer_load_ofs_imm: ; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen offset:52 ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offen offset:52 +; GFX11: tbuffer_load_format_xyzw 
{{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offen offset:52 define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) { main_body: %ofs = add i32 %voffs, 52 @@ -96,6 +108,7 @@ ; GCN-LABEL: {{^}}tbuffer_load_both: ; PREGFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offen +; GFX11: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:78 idxen offen define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 78, i32 0) @@ -107,6 +120,7 @@ ; GCN-LABEL: {{^}}buffer_load_xy: ; PREGFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen ; GFX10: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen +; GFX11: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:77 idxen define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { %vdata = call <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast <2 x i32> %vdata to <2 x float> @@ -116,6 +130,7 @@ ; GCN-LABEL: {{^}}buffer_load_x: ; PREGFX10: tbuffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen ; GFX10: tbuffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen +; GFX11: 
tbuffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:77 idxen define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { %vdata = call i32 @llvm.amdgcn.struct.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 77, i32 0) %vdata.f = bitcast i32 %vdata to float diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll @@ -1,6 +1,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GCN,VERDE,PREGFX10 %s ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GCN,PREGFX10 %s ;RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX10 %s +;RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GCN,GFX11 %s ; GCN-LABEL: {{^}}tbuffer_store: ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 @@ -12,6 +13,10 @@ ; GFX10: tbuffer_store_format_xyzw v[4:7], [[ZEROREG]], s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] idxen glc ; GFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], 0 format:78 idxen slc ; GFX10: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], 0 format:78 idxen glc dlc +; GFX11: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], 0 format:[BUF_FMT_8_8_8_8_USCALED] idxen +; GFX11: tbuffer_store_format_xyzw v[4:7], [[ZEROREG]], s[0:3], 0 format:[BUF_FMT_32_32_32_32_UINT] idxen glc +; GFX11: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], 0 format:78 idxen slc +; GFX11: tbuffer_store_format_xyzw v[8:11], [[ZEROREG]], s[0:3], 0 format:78 idxen glc dlc define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -28,6 +33,7 @@ ; GCN: v_mov_b32_e32 
[[ZEROREG:v[0-9]+]], 0 ; PREGFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] idxen offset:42 ; GFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], 0 format:117 idxen offset:42 +; GFX11: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], 0 format:117 idxen offset:42 define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { main_body: %in1 = bitcast <4 x float> %1 to <4 x i32> @@ -39,6 +45,7 @@ ; GCN: v_mov_b32_e32 [[ZEROREG:v[0-9]+]], 0 ; PREGFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], {{s[0-9]+}} format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] idxen offset:42 ; GFX10: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], {{s[0-9]+}} format:117 idxen offset:42 +; GFX11: tbuffer_store_format_xyzw v[0:3], [[ZEROREG]], s[0:3], {{s[0-9]+}} format:117 idxen offset:42 define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -49,6 +56,7 @@ ; GCN-LABEL: {{^}}buffer_store_idx: ; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] idxen ; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SSCALED] idxen +; GFX11: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] idxen define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -59,6 +67,7 @@ ; GCN-LABEL: {{^}}buffer_store_ofs: ; PREGFX10: tbuffer_store_format_xyzw v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 format:[BUF_DATA_FORMAT_8_8,BUF_NUM_FORMAT_FLOAT] idxen offen ; GFX10: tbuffer_store_format_xyzw v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 format:115 idxen offen +; GFX11: tbuffer_store_format_xyzw v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 format:115 idxen offen define 
amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -69,6 +78,7 @@ ; GCN-LABEL: {{^}}buffer_store_both: ; PREGFX10: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_UINT] idxen offen ; GFX10: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_16_16_16_16_SINT] idxen offen +; GFX11: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:70 idxen offen define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -81,11 +91,13 @@ ; GCN-LABEL: {{^}}buffer_store_wait: ; PREGFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen ; GFX10: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_SINT] idxen +; GFX11: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen ; VERDE: s_waitcnt expcnt(0) ; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ; GCN: s_waitcnt vmcnt(0) ; PREGFX10: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_USCALED] idxen ; GFX10: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_FMT_10_10_10_2_USCALED] idxen +; GFX11: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_FMT_8_8_8_8_UINT] idxen define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) { main_body: %in1 = bitcast <4 x float> %vdata to <4 x i32> @@ -99,6 +111,7 @@ ; GCN-LABEL: {{^}}buffer_store_x1: ; PREGFX10: tbuffer_store_format_x v0, v1, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen ; GFX10: tbuffer_store_format_x v0, v1, s[0:3], 0 format:125 idxen +; GFX11: tbuffer_store_format_x v0, v1, s[0:3], 0 format:125 idxen define amdgpu_ps void 
@buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) { main_body: %data.i = bitcast float %data to i32 @@ -109,6 +122,7 @@ ; GCN-LABEL: {{^}}buffer_store_x2: ; PREGFX10: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen ; GFX10: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen +; GFX11: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) { main_body: %data.i = bitcast <2 x float> %data to <2 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -3,6 +3,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s define amdgpu_kernel void @cos_f16(half addrspace(1)* %r, half addrspace(1)* %a) { ; GFX6-LABEL: cos_f16: @@ -66,6 +67,20 @@ ; GFX10-NEXT: v_cos_f16_e32 v1, v1 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: cos_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-NEXT: v_cos_f16_e32 v1, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a.val = load half, half addrspace(1)* %a %r.val = call half @llvm.cos.f16(half %a.val) store half %r.val, half addrspace(1)* %r @@ -155,6 +170,26 @@ ; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: cos_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 +; GFX11-NEXT: v_cos_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cos_f16_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a.val = load <2 x half>, <2 x half> addrspace(1)* %a %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val) store <2 x half> %r.val, <2 x half> addrspace(1)* %r diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < 
%s | FileCheck -enable-var-scope --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s declare half @llvm.maxnum.f16(half %a, half %b) declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) @@ -113,6 +114,34 @@ ; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: maxnum_f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { @@ -202,6 +231,27 @@ ; GFX10-NEXT: v_max_f16_e32 v0, 0x4200, v0 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: maxnum_f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, 0x4200, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: @@ -289,6 +339,27 @@ ; GFX10-NEXT: v_max_f16_e32 v0, 4.0, v0 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: maxnum_f16_imm_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, 4.0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a) #0 { entry: @@ -386,6 +457,25 @@ ; GFX10-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: maxnum_v2f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, v1, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) #0 { @@ -464,6 +554,21 @@ ; GFX10-NEXT: v_pk_max_f16 v0, 0x44004200, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: maxnum_v2f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, 0x44004200, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) #0 { entry: @@ -540,6 +645,21 @@ ; GFX10-NEXT: v_pk_max_f16 v0, 0x42004400, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: maxnum_v2f16_imm_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_pk_max_f16 v0, 0x42004400, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) #0 { entry: @@ -657,6 +777,30 @@ ; GFX10-NEXT: 
buffer_store_short v1, off, s[4:7], 0 offset:4 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: maxnum_v3f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v1, s5, s5 +; GFX11-NEXT: v_pk_max_f16 v2, s3, s3 +; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_max_f16 v1, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v0, v3, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <3 x half> addrspace(1)* %r, <3 x half> addrspace(1)* %a, <3 x half> addrspace(1)* %b) #0 { @@ -789,6 +933,28 @@ ; GFX10-NEXT: v_pk_max_f16 v0, v3, v2 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: maxnum_v4f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, s5, s5 +; GFX11-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX11-NEXT: v_pk_max_f16 v2, s4, s4 +; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_max_f16 v1, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v0, v3, v2 +; GFX11-NEXT: 
buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %a, <4 x half> addrspace(1)* %b) #0 { @@ -894,6 +1060,23 @@ ; GFX10-NEXT: v_pk_max_f16 v0, 0x40004800, v2 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fmax_v4f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX11-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v1, 0x44004200, v0 +; GFX11-NEXT: v_pk_max_f16 v0, 0x40004800, v2 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %b) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | 
FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s declare half @llvm.minnum.f16(half %a, half %b) declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) @@ -113,6 +114,33 @@ ; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: minnum_f16_ieee: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_mov_b32 s10, -1 +; GFX11-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-NEXT: s_mov_b32 s14, s10 +; GFX11-NEXT: s_mov_b32 s15, s11 +; GFX11-NEXT: s_mov_b32 s2, s10 +; GFX11-NEXT: s_mov_b32 s3, s11 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s12, s6 +; GFX11-NEXT: s_mov_b32 s13, s7 +; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s5 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { @@ -144,10 +172,10 @@ ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: minnum_f16_no_ieee: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: minnum_f16_no_ieee: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10PLUS-NEXT: ; return to shader part epilog %r.val = call half 
@llvm.minnum.f16(half %a, half %b) ret half %r.val } @@ -230,6 +258,26 @@ ; GFX10-NEXT: v_min_f16_e32 v0, 0x4200, v0 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: minnum_f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_min_f16_e32 v0, 0x4200, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: @@ -317,6 +365,26 @@ ; GFX10-NEXT: v_min_f16_e32 v0, 4.0, v0 ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: minnum_f16_imm_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s6 +; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s8, s2 +; GFX11-NEXT: s_mov_b32 s9, s3 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_min_f16_e32 v0, 4.0, v0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a) #0 { entry: @@ -414,6 +482,24 @@ ; GFX10-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; 
+; GFX11-LABEL: minnum_v2f16_ieee: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) #0 { @@ -452,10 +538,10 @@ ; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: minnum_v2f16_no_ieee: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_pk_min_f16 v0, v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: minnum_v2f16_no_ieee: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX10PLUS-NEXT: ; return to shader part epilog %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %r.val } @@ -527,6 +613,20 @@ ; GFX10-NEXT: v_pk_min_f16 v0, 0x44004200, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: minnum_v2f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_min_f16 v0, 0x44004200, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) #0 { entry: @@ -603,6 +703,20 @@ 
; GFX10-NEXT: v_pk_min_f16 v0, 0x42004400, v0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: minnum_v2f16_imm_b: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_min_f16 v0, 0x42004400, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) #0 { entry: @@ -720,6 +834,29 @@ ; GFX10-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: minnum_v3f16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v1, s5, s5 +; GFX11-NEXT: v_pk_max_f16 v2, s3, s3 +; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 +; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_min_f16 v1, v2, v1 +; GFX11-NEXT: v_pk_min_f16 v0, v3, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <3 x half> addrspace(1)* %r, <3 x half> addrspace(1)* %a, <3 x half> addrspace(1)* %b) #0 { @@ -852,6 +989,27 @@ ; GFX10-NEXT: v_pk_min_f16 v0, v3, v2 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: minnum_v4f16: +; 
GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, s5, s5 +; GFX11-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX11-NEXT: v_pk_max_f16 v2, s4, s4 +; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v0 +; GFX11-NEXT: v_pk_min_f16 v0, v3, v2 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %a, <4 x half> addrspace(1)* %b) #0 { @@ -957,6 +1115,22 @@ ; GFX10-NEXT: v_pk_min_f16 v0, 0x40004800, v2 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fmin_v4f16_imm_a: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX11-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_min_f16 v1, 0x44004200, v0 +; GFX11-NEXT: v_pk_min_f16 v0, 0x40004800, v2 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %b) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -3,6 +3,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck 
-enable-var-scope --check-prefix=GFX8 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s define amdgpu_kernel void @sin_f16(half addrspace(1)* %r, half addrspace(1)* %a) { ; GFX6-LABEL: sin_f16: @@ -66,6 +67,20 @@ ; GFX10-NEXT: v_sin_f16_e32 v1, v1 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sin_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sin_f16_e32 v1, v1 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a.val = load half, half addrspace(1)* %a %r.val = call half @llvm.sin.f16(half %a.val) store half %r.val, half addrspace(1)* %r @@ -155,6 +170,26 @@ ; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sin_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 +; GFX11-NEXT: 
v_sin_f16_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sin_f16_e32 v2, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a.val = load <2 x half>, <2 x half> addrspace(1)* %a %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val) store <2 x half> %r.val, <2 x half> addrspace(1)* %r diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) { ; GFX9-LABEL: load_lds_v4i32: @@ -38,6 +39,14 @@ ; GFX10-NEXT: ds_read_b128 v[0:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_b128 v[0:3], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr ret <4 x i32> %load } @@ -253,6 +262,49 @@ ; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v4i32_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 
0x0 +; GFX11-NEXT: ds_load_u8 v1, v0 +; GFX11-NEXT: ds_load_u8 v2, v0 offset:1 +; GFX11-NEXT: ds_load_u8 v3, v0 offset:2 +; GFX11-NEXT: ds_load_u8 v4, v0 offset:3 +; GFX11-NEXT: ds_load_u8 v5, v0 offset:4 +; GFX11-NEXT: ds_load_u8 v6, v0 offset:5 +; GFX11-NEXT: ds_load_u8 v7, v0 offset:6 +; GFX11-NEXT: ds_load_u8 v8, v0 offset:7 +; GFX11-NEXT: ds_load_u8 v9, v0 offset:8 +; GFX11-NEXT: ds_load_u8 v10, v0 offset:9 +; GFX11-NEXT: ds_load_u8 v11, v0 offset:10 +; GFX11-NEXT: ds_load_u8 v12, v0 offset:11 +; GFX11-NEXT: ds_load_u8 v13, v0 offset:12 +; GFX11-NEXT: ds_load_u8 v14, v0 offset:13 +; GFX11-NEXT: ds_load_u8 v15, v0 offset:14 +; GFX11-NEXT: ds_load_u8 v0, v0 offset:15 +; GFX11-NEXT: s_waitcnt lgkmcnt(14) +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(12) +; GFX11-NEXT: v_lshl_or_b32 v2, v4, 8, v3 +; GFX11-NEXT: s_waitcnt lgkmcnt(10) +; GFX11-NEXT: v_lshl_or_b32 v3, v6, 8, v5 +; GFX11-NEXT: s_waitcnt lgkmcnt(8) +; GFX11-NEXT: v_lshl_or_b32 v4, v8, 8, v7 +; GFX11-NEXT: s_waitcnt lgkmcnt(6) +; GFX11-NEXT: v_lshl_or_b32 v5, v10, 8, v9 +; GFX11-NEXT: s_waitcnt lgkmcnt(4) +; GFX11-NEXT: v_lshl_or_b32 v6, v12, 8, v11 +; GFX11-NEXT: s_waitcnt lgkmcnt(2) +; GFX11-NEXT: v_lshl_or_b32 v7, v14, 8, v13 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_lshl_or_b32 v8, v0, 8, v15 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshl_or_b32 v3, v8, 16, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1 ret <4 x i32> %load } @@ -363,6 +415,28 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v4i32_align2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_u16 v1, v0 +; 
GFX11-NEXT: ds_load_u16 v2, v0 offset:2 +; GFX11-NEXT: ds_load_u16 v3, v0 offset:4 +; GFX11-NEXT: ds_load_u16 v4, v0 offset:6 +; GFX11-NEXT: ds_load_u16 v5, v0 offset:8 +; GFX11-NEXT: ds_load_u16 v6, v0 offset:10 +; GFX11-NEXT: ds_load_u16 v7, v0 offset:12 +; GFX11-NEXT: ds_load_u16 v8, v0 offset:14 +; GFX11-NEXT: s_waitcnt lgkmcnt(6) +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(4) +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX11-NEXT: s_waitcnt lgkmcnt(2) +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_lshl_or_b32 v3, v8, 16, v7 +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2 ret <4 x i32> %load } @@ -410,6 +484,16 @@ ; GFX10-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v4i32_align4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; GFX11-NEXT: ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4 ret <4 x i32> %load } @@ -448,6 +532,14 @@ ; GFX10-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v4i32_align8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_2addr_b64 v[0:3], v0 offset1:1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8 ret <4 x i32> %load } @@ -486,6 +578,14 @@ ; GFX10-NEXT: ds_read_b128 v[0:3], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-LABEL: load_lds_v4i32_align16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_b128 v[0:3], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16 ret <4 x i32> %load } diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s define <3 x i32> @load_lds_v3i32(<3 x i32> addrspace(3)* %ptr) { ; GFX9-LABEL: load_lds_v3i32: @@ -38,6 +39,14 @@ ; GFX10-NEXT: ds_read_b96 v[0:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_b96 v[0:2], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr ret <3 x i32> %load } @@ -209,6 +218,40 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v3i32_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_u8 v1, v0 +; GFX11-NEXT: ds_load_u8 v2, v0 offset:1 +; GFX11-NEXT: ds_load_u8 v3, v0 offset:2 +; 
GFX11-NEXT: ds_load_u8 v4, v0 offset:3 +; GFX11-NEXT: ds_load_u8 v5, v0 offset:4 +; GFX11-NEXT: ds_load_u8 v6, v0 offset:5 +; GFX11-NEXT: ds_load_u8 v7, v0 offset:6 +; GFX11-NEXT: ds_load_u8 v8, v0 offset:7 +; GFX11-NEXT: ds_load_u8 v9, v0 offset:8 +; GFX11-NEXT: ds_load_u8 v10, v0 offset:9 +; GFX11-NEXT: ds_load_u8 v11, v0 offset:10 +; GFX11-NEXT: ds_load_u8 v0, v0 offset:11 +; GFX11-NEXT: s_waitcnt lgkmcnt(10) +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(8) +; GFX11-NEXT: v_lshl_or_b32 v2, v4, 8, v3 +; GFX11-NEXT: s_waitcnt lgkmcnt(6) +; GFX11-NEXT: v_lshl_or_b32 v3, v6, 8, v5 +; GFX11-NEXT: s_waitcnt lgkmcnt(4) +; GFX11-NEXT: v_lshl_or_b32 v4, v8, 8, v7 +; GFX11-NEXT: s_waitcnt lgkmcnt(2) +; GFX11-NEXT: v_lshl_or_b32 v5, v10, 8, v9 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_lshl_or_b32 v6, v0, 8, v11 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1 ret <3 x i32> %load } @@ -298,6 +341,24 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_lshl_or_b32 v2, v6, 16, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v3i32_align2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_u16 v1, v0 +; GFX11-NEXT: ds_load_u16 v2, v0 offset:2 +; GFX11-NEXT: ds_load_u16 v3, v0 offset:4 +; GFX11-NEXT: ds_load_u16 v4, v0 offset:6 +; GFX11-NEXT: ds_load_u16 v5, v0 offset:8 +; GFX11-NEXT: ds_load_u16 v6, v0 offset:10 +; GFX11-NEXT: s_waitcnt lgkmcnt(4) +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(2) +; GFX11-NEXT: v_lshl_or_b32 v1, v4, 16, v3 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 
x i32> addrspace(3)* %ptr, align 2 ret <3 x i32> %load } @@ -343,6 +404,16 @@ ; GFX10-NEXT: ds_read_b32 v2, v2 offset:8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v3i32_align4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1 +; GFX11-NEXT: ds_load_b32 v2, v2 offset:8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4 ret <3 x i32> %load } @@ -387,6 +458,16 @@ ; GFX10-NEXT: ds_read_b32 v2, v2 offset:8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v3i32_align8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: ds_load_b64 v[0:1], v0 +; GFX11-NEXT: ds_load_b32 v2, v2 offset:8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8 ret <3 x i32> %load } @@ -425,6 +506,14 @@ ; GFX10-NEXT: ds_read_b96 v[0:2], v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: load_lds_v3i32_align16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_load_b96 v[0:2], v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16 ret <3 x i32> %load } diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s ; 
RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_lshr_v2i16: @@ -62,6 +63,18 @@ ; GFX10-NEXT: v_pk_lshrrev_b16 v1, s3, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_lshr_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_lshrrev_b16 v1, s3, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = lshr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out ret void @@ -130,6 +143,18 @@ ; GFX10-NEXT: v_pk_lshrrev_b16 v0, v1, v0 ; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_lshr_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_lshrrev_b16 v0, v1, v0 +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -211,6 +236,19 @@ ; GFX10-NEXT: v_pk_lshrrev_b16 v1, s0, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; 
GFX11-LABEL: lshr_v_s_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_lshrrev_b16 v1, s0, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -290,6 +328,19 @@ ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, s0 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lshr_s_v_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_lshrrev_b16 v1, v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -362,6 +413,18 @@ ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lshr_imm_v_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call 
i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -430,6 +493,18 @@ ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lshr_v_imm_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -515,6 +590,19 @@ ; GFX10-NEXT: v_pk_lshrrev_b16 v0, v2, v0 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_lshr_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_lshrrev_b16 v1, v3, v1 +; GFX11-NEXT: v_pk_lshrrev_b16 v0, v2, v0 +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext @@ -592,6 +680,19 @@ ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: lshr_v_imm_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/mcp-overlap-after-propagation.mir b/llvm/test/CodeGen/AMDGPU/mcp-overlap-after-propagation.mir --- a/llvm/test/CodeGen/AMDGPU/mcp-overlap-after-propagation.mir +++ b/llvm/test/CodeGen/AMDGPU/mcp-overlap-after-propagation.mir @@ -1,4 +1,5 @@ # RUN: llc -march=amdgcn -mcpu=gfx1010 %s -o - -run-pass machine-cp -verify-machineinstrs | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 %s -o - -run-pass machine-cp -verify-machineinstrs | FileCheck %s # # The MachineCopyPropagation bug being tested propagates s[60:67] into the copy # into s[56:63], and then uses s[60:67] in the following diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll @@ -1,6 +1,7 @@ ; RUN: not llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s ; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s ; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck %s ; CHECK: error: :0:0: in function invalid_fence void (): Unsupported atomic synchronization scope define amdgpu_kernel void @invalid_fence() { diff 
--git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s ; Test splitting flat instruction offsets into the low and high bits ; when the offset doesn't fit in the offset field. @@ -22,6 +23,14 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 1 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -44,6 +53,14 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 2047 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -66,6 +83,14 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 4095 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -90,6 +115,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8191 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -114,6 +149,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_neg_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -2048 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -138,6 +183,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_neg_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, 
vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -4096 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -162,6 +217,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_neg_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -8192 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -184,6 +249,14 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_2x_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 4095 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -208,6 +281,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_2x_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8191 %load = 
load i8, i8* %gep, align 4 ret i8 %load @@ -232,6 +315,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_2x_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 16383 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -256,6 +349,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_2x_neg_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -4096 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -280,6 +383,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_2x_neg_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -8192 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -304,6 +417,16 @@ ; 
GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_2x_neg_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -16384 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -329,6 +452,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_64bit_11bit_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8589936639 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -354,6 +487,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_64bit_11bit_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2048 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8589936640 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -379,6 +522,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; 
GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8589938687 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -404,6 +557,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8589938688 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -429,6 +592,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8589942783 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -454,6 +627,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8589942784 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -480,6 +663,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -9223372036854773761 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -506,6 +699,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -9223372036854773760 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -532,6 +735,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -9223372036854771713 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -558,6 +771,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -9223372036854771712 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -584,6 +807,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -9223372036854767617 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -610,6 +843,16 @@ ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] ; GFX10-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -9223372036854767616 %load = load i8, i8* %gep, align 4 ret i8 %load @@ -639,6 +882,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:1 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 1 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -669,6 +923,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 2047 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -699,6 +964,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 4095 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -731,6 +1007,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8191 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -763,6 +1052,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_neg_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0xfffff800, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -2048 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -795,6 +1097,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_neg_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -4096 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -827,6 +1142,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_neg_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -8192 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -857,6 +1185,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_2x_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 4095 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -889,6 +1228,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_2x_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8191 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -921,6 +1273,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_2x_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0x3000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 16383 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -953,6 +1318,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_2x_neg_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -4096 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -985,6 +1363,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_2x_neg_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -8192 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -1017,6 +1408,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_2x_neg_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -16384 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -1049,6 +1453,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_64bit_11bit_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8589936639 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -1081,6 +1498,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_64bit_11bit_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2048 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8589936640 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -1113,6 +1543,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_64bit_12bit_split0: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8589938687 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -1146,6 +1589,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_64bit_12bit_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8589938688 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -1179,6 +1635,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_64bit_13bit_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep 
= getelementptr i8, i8* %p, i64 8589942783 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -1212,6 +1681,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_64bit_13bit_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 +; GFX11-NEXT: flat_load_u8 v0, v[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 8589942784 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -1246,6 +1728,20 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -9223372036854773761 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -1280,6 +1776,20 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -9223372036854773760 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -1314,6 +1824,20 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -9223372036854771713 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -1348,6 +1872,20 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] glc dlc +; GFX11-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -9223372036854771712 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -1382,6 +1920,20 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -9223372036854767617 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef @@ -1416,6 +1968,20 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_store_byte v[0:1], v0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: flat_load_u8 v0, v[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: flat_store_b8 v[0:1], v0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8* %p, i64 -9223372036854767616 %load = load volatile i8, i8* %gep, align 1 store i8 %load, i8* undef diff --git 
a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s ; Test splitting flat instruction offsets into the low and high bits ; when the offset doesn't fit in the offset field. @@ -20,6 +21,14 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -40,6 +49,14 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2047 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -62,6 +79,14 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
global_inst_valu_offset_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -86,6 +111,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -106,6 +141,14 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_neg_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-2048 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -128,6 +171,14 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_neg_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off 
offset:-4096 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -152,6 +203,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_neg_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -174,6 +235,14 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_2x_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -198,6 +267,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_2x_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = 
getelementptr i8, i8 addrspace(1)* %p, i64 8191 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -222,6 +301,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_2x_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -244,6 +333,14 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_2x_neg_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-4096 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -268,6 +365,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_2x_neg_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192 %load = load i8, i8 addrspace(1)* %gep, align 4 ret 
i8 %load @@ -292,6 +399,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_2x_neg_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -317,6 +434,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_64bit_11bit_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2047 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -342,6 +469,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_64bit_11bit_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2048 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640 %load = load i8, i8 
addrspace(1)* %gep, align 4 ret i8 %load @@ -367,6 +504,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_64bit_12bit_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -392,6 +539,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_64bit_12bit_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -417,6 +574,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_64bit_13bit_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* 
%p, i64 8589942783 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -442,6 +609,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_64bit_13bit_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -468,6 +645,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-2049 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -494,6 +681,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-2048 +; GFX11-NEXT: s_waitcnt vmcnt(0) 
+; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -520,6 +717,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -546,6 +753,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -572,6 +789,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, 
vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:-1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -598,6 +825,16 @@ ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo +; GFX11-NEXT: global_load_u8 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616 %load = load i8, i8 addrspace(1)* %gep, align 4 ret i8 %load @@ -623,6 +860,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -649,6 +897,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: 
global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -675,6 +934,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -701,6 +971,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -727,6 +1008,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_neg_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -754,6 +1046,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_neg_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -783,6 +1086,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_neg_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -809,6 +1125,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_2x_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -835,6 +1162,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_2x_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -861,6 +1199,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_2x_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0x3000 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store 
i8 %load, i8 addrspace(1)* undef @@ -888,6 +1237,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_2x_neg_11bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -917,6 +1277,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_2x_neg_12bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -946,6 +1319,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_2x_neg_13bit_max: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0 +; GFX11-NEXT: 
global_load_u8 v0, v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -976,6 +1362,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2047 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -1006,6 +1405,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:2048 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -1036,6 +1448,19 
@@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -1066,6 +1491,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -1096,6 +1534,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0x1000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -1126,6 +1577,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s0, 0x2000, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0 +; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -1157,6 +1621,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s0, s0, 0x7ff +; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 
store i8 %load, i8 addrspace(1)* undef @@ -1188,6 +1665,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s0, s0, 0x800 +; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -1219,6 +1709,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s0, s0, 0xfff +; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -1250,6 +1753,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s0, 
s0, 0x1000 +; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -1281,6 +1797,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s0, s0, 0x1fff +; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 %load, i8 addrspace(1)* undef @@ -1312,6 +1841,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s0, s0, 0x2000 +; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000 +; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616 %load = load volatile i8, i8 addrspace(1)* %gep, align 1 store i8 
%load, i8 addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/or3.ll b/llvm/test/CodeGen/AMDGPU/or3.ll --- a/llvm/test/CodeGen/AMDGPU/or3.ll +++ b/llvm/test/CodeGen/AMDGPU/or3.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s ; =================================================================================== ; V_OR3_B32 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX900 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX90A %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s declare i64 @_Z13get_global_idj(i32) #0 @@ -303,6 +304,78 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v13, v3, vcc ; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX90A-NEXT: s_endpgm +; +; GFX11-LABEL: clmem_read_simplified: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_load_b64 
s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff8000, v2 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2048 +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, 0x2000 +; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[12:13], v[8:9], off offset:-4096 +; GFX11-NEXT: global_load_b64 v[10:11], v[10:11], off offset:2048 +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, 0x2000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x3000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_load_b64 v[14:15], v[14:15], off offset:2048 +; GFX11-NEXT: global_load_b64 v[16:17], v[0:1], off +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v6, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 
v7, v5, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v12, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v13, v5, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v10, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v11, v5, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v8, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v9, v5, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v14, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v15, v5, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v16, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v17, v5, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -817,6 +890,130 @@ ; GFX90A-NEXT: .LBB1_5: ; %while.end ; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[4:5], off ; GFX90A-NEXT: s_endpgm +; +; GFX11-LABEL: clmem_read: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: 
v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xff, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_lshlrev_b32 v3, 17, v0 +; GFX11-NEXT: s_movk_i32 s1, 0x7f +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] +; GFX11-NEXT: v_and_b32_e32 v2, 0xfe000000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v0, s34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s35, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v2 +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x5000, v3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v6, vcc_lo +; GFX11-NEXT: .LBB1_1: ; %for.cond.preheader +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB1_2 Depth 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v7, v3 :: v_dual_mov_b32 v6, v2 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB1_2: ; %for.body +; GFX11-NEXT: ; Parent Loop BB1_1 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v6, 0xffffc000 +; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, -1, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0xffffc000, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, -1, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, 
0xffffd000, v6 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[14:15], v[8:9], off offset:-4096 +; GFX11-NEXT: global_load_b64 v[10:11], v[10:11], off offset:-2048 +; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, -1, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v6, 0xffffe000 +; GFX11-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, -1, v7, vcc_lo +; GFX11-NEXT: global_load_b64 v[12:13], v[12:13], off offset:-2048 +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, 0xffffe000, v6 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[20:21], v[16:17], off offset:-4096 +; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off +; GFX11-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, -1, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, 0xfffff000, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v23, vcc_lo, -1, v7, vcc_lo +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: global_load_b64 v[18:19], v[18:19], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[16:17], v[16:17], off +; GFX11-NEXT: global_load_b64 v[22:23], v[22:23], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[24:25], v[6:7], off offset:-4096 +; GFX11-NEXT: global_load_b64 v[26:27], v[6:7], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[28:29], v[6:7], off +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x10000, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo +; GFX11-NEXT: s_addk_i32 s2, 0x2000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_cmp_gt_u32 s2, 0x3fffff +; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: v_add_co_u32 v4, s0, v14, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v15, v5, s0 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v4, s0, v10, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v11, v5, s0 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 
v4, s0, v8, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v9, v5, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v4, s0, v12, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v13, v5, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v4, s0, v20, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v21, v5, s0 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v4, s0, v18, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v19, v5, s0 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v4, s0, v16, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v17, v5, s0 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v4, s0, v22, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v23, v5, s0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v4, s0, v24, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v25, v5, s0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v4, s0, v26, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s0, v27, v5, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v28, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v29, v5, vcc_lo +; GFX11-NEXT: s_cbranch_scc0 .LBB1_2 +; GFX11-NEXT: ; %bb.3: ; %while.cond.loopexit +; GFX11-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; GFX11-NEXT: s_add_i32 s0, s1, -1 +; GFX11-NEXT: s_cmp_eq_u32 s1, 0 +; GFX11-NEXT: s_cbranch_scc1 
.LBB1_5 +; GFX11-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_branch .LBB1_1 +; GFX11-NEXT: .LBB1_5: ; %while.end +; GFX11-NEXT: global_store_b64 v[0:1], v[4:5], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -1185,6 +1382,63 @@ ; GFX90A-NEXT: v_add3_u32 v2, v4, v2, v5 ; GFX90A-NEXT: global_store_dword v[0:1], v2, off ; GFX90A-NEXT: s_endpgm +; +; GFX11-LABEL: Address32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff8000, v2 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v8, v[0:1], off +; GFX11-NEXT: global_load_b32 v9, v[0:1], off offset:1024 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v0, 0x2000 +; GFX11-NEXT: 
v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: global_load_b32 v10, v[0:1], off offset:2048 +; GFX11-NEXT: global_load_b32 v11, v[0:1], off offset:3072 +; GFX11-NEXT: global_load_b32 v12, v[6:7], off offset:-4096 +; GFX11-NEXT: global_load_b32 v13, v[4:5], off offset:1024 +; GFX11-NEXT: global_load_b32 v14, v[4:5], off offset:2048 +; GFX11-NEXT: global_load_b32 v4, v[4:5], off offset:3072 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v5, v[6:7], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:1024 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_add_nc_u32_e32 v1, v9, v8 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v10, v1, v11 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_add3_u32 v1, v12, v1, v13 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v14, v1, v4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add3_u32 v0, v5, v1, v0 +; GFX11-NEXT: global_store_b32 v[2:3], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -1441,6 +1695,52 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc ; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX90A-NEXT: s_endpgm +; +; GFX11-LABEL: Offset64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 +; 
GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff8000, v2 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v0, 0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 1, v1, vcc_lo +; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_load_b64 v[8:9], v[6:7], off offset:-4096 +; GFX11-NEXT: global_load_b64 v[6:7], v[6:7], off +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v8, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v9, v5, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v1, vcc_lo +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -1657,6 +1957,47 @@ ; GFX90A-NEXT: v_add3_u32 v2, v10, v2, v11 ; GFX90A-NEXT: global_store_dword v[0:1], v2, off ; GFX90A-NEXT: s_endpgm +; +; GFX11-LABEL: p32Offset64: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[34:35], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff8000, v2 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7ffff000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x80000000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[4:5], off offset:2048 +; GFX11-NEXT: global_load_b32 v4, v[4:5], off offset:3072 +; GFX11-NEXT: global_load_b32 v5, 
v[6:7], off +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, v4, v0, v5 +; GFX11-NEXT: global_store_b32 v[2:3], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -1873,6 +2214,60 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: DiffBase: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b128 s[36:39], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff8000, v0 +; GFX11-NEXT: v_add_co_u32 v0, s0, s36, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s37, 0, s0 +; GFX11-NEXT: v_add_co_u32 v10, s0, s38, v2 +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x1000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, s39, 0, s0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x2000 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, 0x2000, v10 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v11, vcc_lo +; GFX11-NEXT: 
v_add_co_u32 v10, vcc_lo, 0x3000, v10 +; GFX11-NEXT: global_load_b64 v[8:9], v[4:5], off offset:-4096 +; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v11, vcc_lo +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:2048 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[6:7], v[6:7], off offset:2048 +; GFX11-NEXT: global_load_b64 v[12:13], v[10:11], off +; GFX11-NEXT: global_load_b64 v[4:5], v[4:5], off +; GFX11-NEXT: global_load_b64 v[10:11], v[10:11], off offset:2048 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v8 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v9, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v12, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v13, v7, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v10, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v11, v7, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm i8 addrspace(1)* %buffer2) { entry: %call = tail call i64 @_Z13get_global_idj(i32 0) @@ -2210,6 +2605,75 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v23, v3, vcc ; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX90A-NEXT: s_endpgm +; +; GFX11-LABEL: ReverseOrder: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: 
s_load_b64 s[34:35], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff8000, v2 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x3000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0x2000, v0 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off +; GFX11-NEXT: global_load_b64 v[8:9], v[4:5], off offset:2048 +; GFX11-NEXT: global_load_b64 v[4:5], v[4:5], off +; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, 0x1000, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: global_load_b64 v[14:15], v[10:11], off offset:2048 +; GFX11-NEXT: global_load_b64 v[16:17], v[12:13], off +; GFX11-NEXT: global_load_b64 v[10:11], v[10:11], off +; GFX11-NEXT: global_load_b64 v[12:13], v[12:13], off offset:2048 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v8, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v9, v7, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v14, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v15, v5, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v10, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v11, v5, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v12, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v13, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v16, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v17, v5, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -2424,6 +2888,43 @@ ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v9, v7, vcc ; GFX90A-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX90A-NEXT: s_endpgm +; +; GFX11-LABEL: negativeoffset: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX11-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: 
s_load_b64 s[34:35], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s32, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff8000, v2 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v2, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0, v4 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v5, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[4:5], v[4:5], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %call = tail call i64 @_Z13get_global_idj(i32 0) #2 %conv = and i64 %call, 255 diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll --- a/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 
-verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN: foo1: ; v_cndmask_b32_e64 v0, 0, 1, vcc_lo{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-clone.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-clone.ll --- a/llvm/test/CodeGen/AMDGPU/propagate-attributes-clone.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-clone.ll @@ -3,6 +3,7 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -O1 --amdgpu-internalize-symbols < %s | FileCheck -check-prefixes=OPT,OPT-INT %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes='default' --amdgpu-internalize-symbols < %s | FileCheck -check-prefixes=OPT,OPT-INT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=LLC %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=LLC %s ; OPT: declare void @foo4() local_unnamed_addr #0 ; OPT: define internal fastcc void @0() unnamed_addr #1 diff --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-single-set.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-single-set.ll --- a/llvm/test/CodeGen/AMDGPU/propagate-attributes-single-set.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-single-set.ll @@ -1,6 +1,7 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -O1 < %s | FileCheck -check-prefix=OPT %s ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes='default' < %s | FileCheck -check-prefix=OPT %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=LLC %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=LLC %s ; OPT: declare void @foo4() local_unnamed_addr #0 ; OPT: define void @foo3() local_unnamed_addr #1 diff --git a/llvm/test/CodeGen/AMDGPU/ptrmask.ll b/llvm/test/CodeGen/AMDGPU/ptrmask.ll --- a/llvm/test/CodeGen/AMDGPU/ptrmask.ll +++ 
b/llvm/test/CodeGen/AMDGPU/ptrmask.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define i8 addrspace(1)* @v_ptrmask_global_variable_i64(i8 addrspace(1)* %ptr, i64 %mask) { ; GCN-LABEL: v_ptrmask_global_variable_i64: @@ -10,13 +11,13 @@ ; GCN-NEXT: v_and_b32_e32 v0, v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ptrmask_global_variable_i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ptrmask_global_variable_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX10PLUS-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i64(i8 addrspace(1)* %ptr, i64 %mask) ret i8 addrspace(1)* %masked } @@ -36,6 +37,13 @@ ; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_ptrmask_global_variable_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i32(i8 addrspace(1)* %ptr, i32 %mask) ret i8 
addrspace(1)* %masked } @@ -55,6 +63,14 @@ ; GFX10-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_ptrmask_global_variable_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i16(i8 addrspace(1)* %ptr, i16 %mask) ret i8 addrspace(1)* %masked } @@ -66,12 +82,12 @@ ; GCN-NEXT: v_and_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ptrmask_local_variable_i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ptrmask_local_variable_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i64(i8 addrspace(3)* %ptr, i64 %mask) ret i8 addrspace(3)* %masked } @@ -83,12 +99,12 @@ ; GCN-NEXT: v_and_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ptrmask_local_variable_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ptrmask_local_variable_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %masked = call i8 addrspace(3)* 
@llvm.ptrmask.p3i8.i32(i8 addrspace(3)* %ptr, i32 %mask) ret i8 addrspace(3)* %masked } @@ -106,6 +122,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_ptrmask_local_variable_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i16(i8 addrspace(3)* %ptr, i16 %mask) ret i8 addrspace(3)* %masked } @@ -116,10 +140,10 @@ ; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ptrmask_global_variable_i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ptrmask_global_variable_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] +; GFX10PLUS-NEXT: ; return to shader part epilog %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i64(i8 addrspace(1)* %ptr, i64 %mask) ret i8 addrspace(1)* %masked } @@ -132,12 +156,12 @@ ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ptrmask_global_variable_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s5, 0 -; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ptrmask_global_variable_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s5, 0 +; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[2:3], s[4:5] +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: ; return to shader part epilog %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i32(i8 addrspace(1)* %ptr, i32 %mask) ret i8 addrspace(1)* %masked } @@ -151,13 +175,13 @@ ; GCN-NEXT: s_mov_b32 s1, 0 ; 
GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ptrmask_global_variable_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: s_and_b32 s0, s4, 0xffff -; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ptrmask_global_variable_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: s_and_b32 s0, s4, 0xffff +; GFX10PLUS-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GFX10PLUS-NEXT: s_mov_b32 s1, 0 +; GFX10PLUS-NEXT: ; return to shader part epilog %masked = call i8 addrspace(1)* @llvm.ptrmask.p1i8.i16(i8 addrspace(1)* %ptr, i16 %mask) ret i8 addrspace(1)* %masked } @@ -168,10 +192,10 @@ ; GCN-NEXT: s_and_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ptrmask_local_variable_i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s2, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ptrmask_local_variable_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s0, s2, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i64(i8 addrspace(3)* %ptr, i64 %mask) ret i8 addrspace(3)* %masked } @@ -182,10 +206,10 @@ ; GCN-NEXT: s_and_b32 s0, s2, s3 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ptrmask_local_variable_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s2, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ptrmask_local_variable_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s0, s2, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i32(i8 addrspace(3)* %ptr, i32 %mask) ret i8 addrspace(3)* %masked } @@ -197,11 +221,11 @@ ; GCN-NEXT: s_and_b32 s0, s2, s0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_ptrmask_local_variable_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, 0xffff, s3 -; GFX10-NEXT: 
s_and_b32 s0, s2, s0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_ptrmask_local_variable_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s3 +; GFX10PLUS-NEXT: s_and_b32 s0, s2, s0 +; GFX10PLUS-NEXT: ; return to shader part epilog %masked = call i8 addrspace(3)* @llvm.ptrmask.p3i8.i16(i8 addrspace(3)* %ptr, i16 %mask) ret i8 addrspace(3)* %masked } diff --git a/llvm/test/CodeGen/AMDGPU/rel32.ll b/llvm/test/CodeGen/AMDGPU/rel32.ll --- a/llvm/test/CodeGen/AMDGPU/rel32.ll +++ b/llvm/test/CodeGen/AMDGPU/rel32.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s @g = protected local_unnamed_addr addrspace(4) externally_initialized global i32 0, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9 ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s --check-prefix=GFX11 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone @@ -93,6 +94,27 @@ ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: saddo_i64_zext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s2, s6, s0 +; GFX11-NEXT: s_addc_u32 s3, s7, s1 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, s0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %sadd, 0 %carry = extractvalue { i64, i1 } %sadd, 1 @@ -181,6 +203,24 @@ ; GFX10-NEXT: global_store_dword v1, v2, s[4:5] ; GFX10-NEXT: global_store_byte v1, v0, s[6:7] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_saddo_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_nc_i32 v0, s4, s5 clamp +; GFX11-NEXT: s_add_i32 s4, s4, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX11-NEXT: global_store_b8 v1, v0, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind %val = extractvalue { i32, i1 } %sadd, 0 %carry = extractvalue { i32, i1 } %sadd, 1 @@ -276,6 +316,26 @@ ; GFX10-NEXT: 
global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: global_store_byte v0, v2, s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_saddo_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_i32 v3, v1, v2 clamp +; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind @@ -371,6 +431,25 @@ ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: global_store_byte v2, v3, s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_saddo_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s8, s4, s6 +; GFX11-NEXT: s_addc_u32 s9, s5, s7 +; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] +; GFX11-NEXT: v_mov_b32_e32 v0, s8 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s4, s6, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %sadd = call { i64, i1 } 
@llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %sadd, 0 %carry = extractvalue { i64, i1 } %sadd, 1 @@ -472,6 +551,29 @@ ; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5] ; GFX10-NEXT: global_store_byte v6, v0, s[6:7] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_saddo_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[8:9] +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[10:11] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] +; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5] +; GFX11-NEXT: global_store_b8 v6, v0, s[6:7] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load i64, i64 addrspace(1)* %aptr, align 4 %b = load i64, i64 addrspace(1)* %bptr, align 4 %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind @@ -587,6 +689,30 @@ ; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[0:1] ; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_saddo_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v5, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v5, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v4, v1, v3 +; GFX11-NEXT: v_add_nc_i32 v1, v1, v3 clamp 
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v0, v2 +; GFX11-NEXT: v_add_nc_i32 v0, v0, v2 clamp +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v5, v[3:4], s[0:1] +; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4 %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4 %sadd = call { <2 x i32>, <2 x i1> } @llvm.sadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-LABEL: v_saddsat_i8: @@ -32,15 +33,15 @@ ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_saddsat_i8: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX10-NEXT: 
v_add_nc_i16 v0, v0, v1 clamp -; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_saddsat_i8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, v1 clamp +; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -74,12 +75,12 @@ ; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_saddsat_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_saddsat_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, v1 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } @@ -115,12 +116,12 @@ ; GFX9-NEXT: v_add_i32 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_saddsat_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_nc_i32 v0, v0, v1 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_saddsat_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v1 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result } @@ -174,12 +175,12 @@ ; GFX9-NEXT: v_pk_add_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_saddsat_v2i16: -; 
GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_saddsat_v2i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v1 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result } @@ -247,13 +248,13 @@ ; GFX9-NEXT: v_pk_add_i16 v0, v0, v2 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_saddsat_v3i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_i16 v0, v0, v2 clamp -; GFX10-NEXT: v_pk_add_i16 v1, v1, v3 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_saddsat_v3i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v2 clamp +; GFX10PLUS-NEXT: v_pk_add_i16 v1, v1, v3 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i16> @llvm.sadd.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) ret <3 x i16> %result } @@ -338,13 +339,13 @@ ; GFX9-NEXT: v_pk_add_i16 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_saddsat_v4i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_i16 v0, v0, v2 clamp -; GFX10-NEXT: v_pk_add_i16 v1, v1, v3 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_saddsat_v4i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_add_i16 v0, v0, v2 clamp +; GFX10PLUS-NEXT: v_pk_add_i16 v1, v1, v3 clamp +; GFX10PLUS-NEXT: s_setpc_b64 
s[30:31] %result = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x float> ret <2 x float> %cast @@ -396,13 +397,13 @@ ; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_saddsat_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_nc_i32 v0, v0, v2 clamp -; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_saddsat_v2i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_add_nc_i32 v0, v0, v2 clamp +; GFX10PLUS-NEXT: v_add_nc_i32 v1, v1, v3 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result } @@ -464,6 +465,20 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_saddsat_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] +; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result } diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn 
-mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_shl_v2i16: @@ -65,6 +66,19 @@ ; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_shl_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_lshlrev_b16 v0, s3, s2 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %result = shl <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out ret void @@ -133,6 +147,18 @@ ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v1, v0 ; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_shl_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_lshlrev_b16 v0, v1, v0 +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = 
getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -214,6 +240,19 @@ ; GFX10-NEXT: v_pk_lshlrev_b16 v1, s0, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: shl_v_s_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_lshlrev_b16 v1, s0, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -293,6 +332,19 @@ ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, s0 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: shl_s_v_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -366,6 +418,18 @@ ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: shl_imm_v_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -435,6 +499,18 @@ ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: shl_v_imm_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -520,6 +596,19 @@ ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_shl_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_lshlrev_b16 v1, v3, v1 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, v2, v0 +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext @@ -603,6 +692,19 @@ ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: 
global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: shl_v_imm_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/shl_add.ll b/llvm/test/CodeGen/AMDGPU/shl_add.ll --- a/llvm/test/CodeGen/AMDGPU/shl_add.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s ; =================================================================================== ; V_LSHL_ADD_U32 diff --git a/llvm/test/CodeGen/AMDGPU/shl_or.ll b/llvm/test/CodeGen/AMDGPU/shl_or.ll --- a/llvm/test/CodeGen/AMDGPU/shl_or.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_or.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck 
-check-prefix=GFX10 %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s ; =================================================================================== ; V_LSHL_OR_B32 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9 ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10 +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s --check-prefix=GFX11 ; Test that add/sub with a constant is swapped to sub/add with negated ; constant to minimize code size. 
@@ -62,6 +63,18 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_i32_x_sub_64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -151,6 +164,24 @@ ; GFX10-NEXT: global_store_dword v0, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_i32_x_sub_64_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 +; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 64, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -219,6 +250,18 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_i32_64_sub_x: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 
2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -284,6 +327,18 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_i32_x_sub_65: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -349,6 +404,18 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_i32_65_sub_x: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -414,6 +481,18 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v1, 16, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; 
GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_i32_x_sub_neg16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 16, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -479,6 +558,18 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v1, -16, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_i32_neg16_sub_x: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, -16, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -544,6 +635,18 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v1, 17, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_i32_x_sub_neg17: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 17, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds 
i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -609,6 +712,18 @@ ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_i32_neg17_sub_x: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -659,6 +774,16 @@ ; GFX10-NEXT: ; use s0 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_i32_x_sub_64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sub_i32 s0, s0, 64 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm %result = sub i32 %x, 64 call void asm sideeffect "; use $0", "s"(i32 %result) ret void @@ -719,6 +844,18 @@ ; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_i16_x_sub_64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext @@ -790,6 +927,21 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; 
GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v1, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext @@ -880,6 +1032,24 @@ ; GFX10-NEXT: global_store_short v0, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX11-NEXT: v_sub_nc_u16 v2, v2, 64 +; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext @@ -954,6 +1124,18 @@ ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_sub_64_64: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1026,6 +1208,18 @@ ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x400007 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_sub_7_64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x400007 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1098,6 +1292,18 @@ ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_sub_64_123: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* 
%in, i64 %tid.ext @@ -1168,6 +1374,18 @@ ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 7 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_sub_7_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_i16 v1, v1, 7 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1236,6 +1454,18 @@ ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_sub_0_16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1304,6 +1534,18 @@ ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0xc400 op_sel:[0,1] op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_sub_0_1_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0xc400 
op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1372,6 +1614,18 @@ ; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x4400 op_sel:[0,1] op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_sub_0_neg1_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x4400 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1444,6 +1698,18 @@ ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1511,6 +1777,18 @@ ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, 
s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_add_0_neg32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1580,6 +1858,18 @@ ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_add_neg32_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1652,6 +1942,18 @@ ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 
@llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1719,6 +2021,18 @@ ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_add_0_neg16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1788,6 +2102,18 @@ ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_add_neg16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1860,6 +2186,18 @@ ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_add_neg_fpone: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -1932,6 +2270,18 @@ ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_add_neg_negfpone: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -2004,6 +2354,18 @@ ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -2076,6 +2438,18 @@ ; 
GFX10-NEXT: v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -2143,6 +2517,18 @@ ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -2209,6 +2595,18 @@ ; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_v2i16_x_add_neg32_undef: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 +; GFX11-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/shrink-instructions-implicit-vcclo.mir b/llvm/test/CodeGen/AMDGPU/shrink-instructions-implicit-vcclo.mir --- a/llvm/test/CodeGen/AMDGPU/shrink-instructions-implicit-vcclo.mir +++ b/llvm/test/CodeGen/AMDGPU/shrink-instructions-implicit-vcclo.mir @@ -1,4 +1,5 @@ # RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-shrink-instructions --verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=si-shrink-instructions --verify-machineinstrs %s -o - | FileCheck %s # Make sure the implicit vcc_lo of V_CNDMASK is preserved and not promoted to vcc. --- diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,SI %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,WAVE64,GFX10-WAVE64 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX10-WAVE32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX11 %s define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 { ; GCN-LABEL: test_kill_depth_0_imm_pos: @@ -31,6 +32,16 @@ ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null 
off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm +; +; GFX11-LABEL: test_kill_depth_0_imm_neg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b64 exec, exec, exec +; GFX11-NEXT: s_cbranch_scc0 .LBB1_1 +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB1_1: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.kill(i1 false) ret void } @@ -66,6 +77,23 @@ ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm +; +; GFX11-LABEL: test_kill_depth_0_imm_neg_x2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_mov_b64 s[0:1], exec +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], exec +; GFX11-NEXT: s_cbranch_scc0 .LBB2_2 +; GFX11-NEXT: ; %bb.1: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], exec +; GFX11-NEXT: s_cbranch_scc0 .LBB2_2 +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB2_2: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.kill(i1 false) call void @llvm.amdgcn.kill(i1 false) ret void @@ -93,6 +121,17 @@ ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm +; +; GFX11-LABEL: test_kill_depth_var: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB3_1 +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB3_1: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm %cmp = fcmp olt float %x, 0.0 call void @llvm.amdgcn.kill(i1 %cmp) ret void @@ -150,6 +189,24 @@ ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm +; +; GFX11-LABEL: test_kill_depth_var_x2_same: +; 
GFX11: ; %bb.0: +; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX11-NEXT: s_mov_b64 s[0:1], exec +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB4_2 +; GFX11-NEXT: ; %bb.1: +; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc +; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB4_2 +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB4_2: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm %cmp = fcmp olt float %x, 0.0 call void @llvm.amdgcn.kill(i1 %cmp) call void @llvm.amdgcn.kill(i1 %cmp) @@ -208,6 +265,24 @@ ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm +; +; GFX11-LABEL: test_kill_depth_var_x2: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX11-NEXT: s_mov_b64 s[0:1], exec +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB5_2 +; GFX11-NEXT: ; %bb.1: +; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc +; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB5_2 +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB5_2: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm %cmp.x = fcmp olt float %x, 0.0 call void @llvm.amdgcn.kill(i1 %cmp.x) %cmp.y = fcmp olt float %y, 0.0 @@ -275,6 +350,27 @@ ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm +; +; GFX11-LABEL: test_kill_depth_var_x2_instructions: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 +; GFX11-NEXT: s_mov_b64 s[0:1], exec +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc +; 
GFX11-NEXT: s_cbranch_scc0 .LBB6_2 +; GFX11-NEXT: ; %bb.1: +; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_mov_b32_e64 v7, -1 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB6_2 +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB6_2: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm %cmp.x = fcmp olt float %x, 0.0 call void @llvm.amdgcn.kill(i1 %cmp.x) %y = call float asm sideeffect "v_mov_b32_e64 v7, -1", "={v7}"() @@ -389,6 +485,42 @@ ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm ; GFX10-WAVE32-NEXT: .LBB7_5: +; +; GFX11-LABEL: test_kill_control_flow: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB7_2 +; GFX11-NEXT: ; %bb.1: ; %exit +; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX11-NEXT: s_branch .LBB7_5 +; GFX11-NEXT: .LBB7_2: ; %bb +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_mov_b32_e64 v7, -1 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 +; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB7_4 +; GFX11-NEXT: ; %bb.3: ; %bb +; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc +; GFX11-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX11-NEXT: s_branch .LBB7_5 +; GFX11-NEXT: .LBB7_4: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB7_5: entry: %cmp = icmp eq i32 %arg, 0 br i1 %cmp, label %bb, label %exit @@ -548,6 +680,52 @@ ; GFX10-WAVE32-NEXT: 
s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm +; +; GFX11-LABEL: test_kill_control_flow_remainder: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_mov_b32_e32 v9, 0 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB8_2 +; GFX11-NEXT: ; %bb.1: ; %exit +; GFX11-NEXT: global_store_b32 v[0:1], v9, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB8_2: ; %bb +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_mov_b32_e64 v7, -1 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v7 +; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_mov_b32_e64 v8, -1 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB8_4 +; GFX11-NEXT: ; %bb.3: ; %bb +; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc +; GFX11-NEXT: global_store_b32 v[0:1], v8, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_mov_b32_e64 v9, -2 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: global_store_b32 v[0:1], v9, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB8_4: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm entry: %cmp = icmp eq i32 %arg, 0 br i1 %cmp, label %bb, label %exit @@ -690,6 +868,44 @@ ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm ; GFX10-WAVE32-NEXT: .LBB9_5: +; +; GFX11-LABEL: test_kill_control_flow_return: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_cmp_eq_u32 s0, 1 +; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b64 s[4:5], s[4:5], exec +; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX11-NEXT: s_cbranch_scc0 .LBB9_4 +; GFX11-NEXT: ; %bb.1: ; %entry +; GFX11-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_cbranch_scc0 .LBB9_3 +; GFX11-NEXT: ; %bb.2: ; %exit +; GFX11-NEXT: s_branch .LBB9_5 +; GFX11-NEXT: .LBB9_3: ; %bb +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_mov_b32_e64 v7, -1 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_mov_b32_e32 v0, v7 +; GFX11-NEXT: s_branch .LBB9_5 +; GFX11-NEXT: .LBB9_4: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB9_5: entry: %kill = icmp eq i32 %arg, 1 %cmp = icmp eq i32 %arg, 0 @@ -849,6 +1065,50 @@ ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm +; +; GFX11-LABEL: test_kill_divergent_loop: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b64 s[0:1], exec +; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11-NEXT: s_cbranch_execz .LBB10_3 +; GFX11-NEXT: .LBB10_1: ; %bb +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: v_mov_b32_e64 v7, -1 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: v_nop_e64 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: 
v_cmp_ngt_f32_e32 vcc, 0, v7 +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB10_4 +; GFX11-NEXT: ; %bb.2: ; %bb +; GFX11-NEXT: ; in Loop: Header=BB10_1 Depth=1 +; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc +; GFX11-NEXT: global_load_b32 v0, v[0:1], off glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX11-NEXT: s_cbranch_vccnz .LBB10_1 +; GFX11-NEXT: .LBB10_3: ; %Flow1 +; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11-NEXT: v_mov_b32_e32 v0, 8 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB10_4: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm entry: %cmp = icmp eq i32 %arg, 0 br i1 %cmp, label %bb, label %exit @@ -971,6 +1231,38 @@ ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm +; +; GFX11-LABEL: phi_use_def_before_kill: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: v_add_f32_e64 v1, s0, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc +; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1 +; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB11_6 +; GFX11-NEXT: ; %bb.1: ; %bb +; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB11_3 +; GFX11-NEXT: ; %bb.2: ; %bb8 +; GFX11-NEXT: v_mov_b32_e32 v1, 8 +; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: .LBB11_3: ; %phibb +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 +; GFX11-NEXT: s_cbranch_vccz .LBB11_5 +; GFX11-NEXT: ; %bb.4: ; %bb10 +; GFX11-NEXT: v_mov_b32_e32 v0, 9 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: .LBB11_5: ; %end +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB11_6: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm bb: %tmp = fadd float %x, 1.000000e+00 %tmp1 = fcmp olt float 0.000000e+00, %tmp @@ -1057,6 +1349,29 @@ ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm +; +; GFX11-LABEL: no_skip_no_successors: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: v_cmp_nge_f32_e64 s[4:5], s1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_and_b64 vcc, exec, s[4:5] +; GFX11-NEXT: s_cbranch_vccz .LBB12_3 +; GFX11-NEXT: ; %bb.1: ; %bb6 +; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], exec +; GFX11-NEXT: s_cbranch_scc0 .LBB12_5 +; GFX11-NEXT: ; %bb.2: ; %bb6 +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: .LBB12_3: ; %bb3 +; GFX11-NEXT: v_cmp_nle_f32_e64 s[0:1], 0x3e7ae148, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX11-NEXT: ; %bb.4: ; %bb5 +; GFX11-NEXT: .LBB12_5: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm bb: %tmp = fcmp ult float %arg1, 0.000000e+00 %tmp2 = fcmp ult float %arg, 0x3FCF5C2900000000 @@ -1201,6 +1516,48 @@ ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm +; +; GFX11-LABEL: if_after_kill_block: +; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: s_wqm_b64 exec, exec +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_mov_b64 s[4:5], exec +; GFX11-NEXT: v_cmpx_nle_f32_e32 0, v1 +; GFX11-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX11-NEXT: s_cbranch_execz .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %bb3 +; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 
0, v0 +; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc +; GFX11-NEXT: s_cbranch_scc0 .LBB13_6 +; GFX11-NEXT: ; %bb.2: ; %bb3 +; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc +; GFX11-NEXT: .LBB13_3: ; %bb4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D +; GFX11-NEXT: s_mov_b64 s[0:1], exec +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmpx_neq_f32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB13_5 +; GFX11-NEXT: ; %bb.4: ; %bb8 +; GFX11-NEXT: v_mov_b32_e32 v0, 9 +; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: .LBB13_5: ; %UnifiedReturnBlock +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB13_6: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm bb: %tmp = fcmp ult float %arg1, 0.000000e+00 br i1 %tmp, label %bb3, label %bb4 @@ -1346,6 +1703,47 @@ ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm +; +; GFX11-LABEL: cbranch_kill: +; GFX11: ; %bb.0: ; %.entry +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_mov_b64 s[0:1], exec +; GFX11-NEXT: s_mov_b32 s5, s4 +; GFX11-NEXT: s_mov_b32 s6, s4 +; GFX11-NEXT: s_mov_b32 s7, s4 +; GFX11-NEXT: s_mov_b32 s8, s4 +; GFX11-NEXT: s_mov_b32 s9, s4 +; GFX11-NEXT: s_mov_b32 s10, s4 +; GFX11-NEXT: s_mov_b32 s11, s4 +; GFX11-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: 
v_cmpx_ge_f32_e32 0, v1 +; GFX11-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX11-NEXT: s_cbranch_execz .LBB14_3 +; GFX11-NEXT: ; %bb.1: ; %kill +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], exec +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_cbranch_scc0 .LBB14_6 +; GFX11-NEXT: ; %bb.2: ; %kill +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: .LBB14_3: ; %Flow +; GFX11-NEXT: s_or_saveexec_b64 s[0:1], s[2:3] +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b64 exec, exec, s[0:1] +; GFX11-NEXT: ; %bb.4: ; %live +; GFX11-NEXT: v_mul_f32_e32 v2, v0, v1 +; GFX11-NEXT: ; %bb.5: ; %export +; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11-NEXT: exp mrt0 v2, v2, v2, v2 done +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB14_6: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm .entry: %sample = call float @llvm.amdgcn.image.sample.l.2darray.f32.f32(i32 1, float %val1, float %val1, float %val1, float 0.000000e+00, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) %cond0 = fcmp ugt float %sample, 0.000000e+00 @@ -1501,6 +1899,52 @@ ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 ; GFX10-WAVE32-NEXT: exp null off, off, off, off done vm ; GFX10-WAVE32-NEXT: s_endpgm +; +; GFX11-LABEL: complex_loop: +; GFX11: ; %bb.0: ; %.entry +; GFX11-NEXT: s_cmp_lt_i32 s0, 1 +; GFX11-NEXT: s_cbranch_scc1 .LBB15_7 +; GFX11-NEXT: ; %bb.1: ; %.lr.ph +; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: s_branch .LBB15_3 +; GFX11-NEXT: .LBB15_2: ; %latch +; GFX11-NEXT: ; in Loop: Header=BB15_3 Depth=1 +; GFX11-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11-NEXT: s_add_i32 s6, s6, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 +; GFX11-NEXT: s_or_b64 s[0:1], 
vcc, s[0:1] +; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX11-NEXT: s_cbranch_execz .LBB15_6 +; GFX11-NEXT: .LBB15_3: ; %hdr +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_mov_b64 s[4:5], exec +; GFX11-NEXT: v_cmpx_gt_u32_e64 s6, v0 +; GFX11-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: ; %bb.4: ; %kill +; GFX11-NEXT: ; in Loop: Header=BB15_3 Depth=1 +; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], exec +; GFX11-NEXT: s_cbranch_scc0 .LBB15_8 +; GFX11-NEXT: ; %bb.5: ; %kill +; GFX11-NEXT: ; in Loop: Header=BB15_3 Depth=1 +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: s_branch .LBB15_2 +; GFX11-NEXT: .LBB15_6: ; %Flow +; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11-NEXT: exp mrt0 v2, v2, v0, v0 done +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB15_7: +; GFX11-NEXT: v_mov_b32_e32 v2, -1 +; GFX11-NEXT: exp mrt0 v2, v2, v0, v0 done +; GFX11-NEXT: s_endpgm +; GFX11-NEXT: .LBB15_8: +; GFX11-NEXT: s_mov_b64 exec, 0 +; GFX11-NEXT: exp mrt0 off, off, off, off done +; GFX11-NEXT: s_endpgm .entry: %flaga = icmp sgt i32 %cmpa, 0 br i1 %flaga, label %.lr.ph, label %._crit_edge @@ -1567,6 +2011,19 @@ ; GFX10-WAVE32-NEXT: .LBB16_2: ; %bb.1 ; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10-WAVE32-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: skip_mode_switch: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b64 s[0:1], exec +; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: ; %bb.1: ; %bb.0 +; GFX11-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 2), 3 +; GFX11-NEXT: .LBB16_2: ; %bb.1 +; GFX11-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %cmp = icmp eq i32 %arg, 0 br i1 %cmp, label %bb.0, label %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ 
b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-LABEL: v_ssubsat_i8: @@ -32,15 +33,15 @@ ; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ssubsat_i8: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp -; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ssubsat_i8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, v1 clamp +; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -74,12 +75,12 @@ ; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ssubsat_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ssubsat_i16: +; GFX10PLUS: ; 
%bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, v1 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } @@ -115,12 +116,12 @@ ; GFX9-NEXT: v_sub_i32 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ssubsat_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_i32 v0, v0, v1 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ssubsat_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v1 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result } @@ -174,12 +175,12 @@ ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ssubsat_v2i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ssubsat_v2i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_sub_i16 v0, v0, v1 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result } @@ -247,13 +248,13 @@ ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ssubsat_v3i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v2 clamp -; GFX10-NEXT: v_pk_sub_i16 v1, v1, v3 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; 
GFX10PLUS-LABEL: v_ssubsat_v3i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_sub_i16 v0, v0, v2 clamp +; GFX10PLUS-NEXT: v_pk_sub_i16 v1, v1, v3 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i16> @llvm.ssub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) ret <3 x i16> %result } @@ -338,13 +339,13 @@ ; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ssubsat_v4i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_sub_i16 v0, v0, v2 clamp -; GFX10-NEXT: v_pk_sub_i16 v1, v1, v3 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ssubsat_v4i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_sub_i16 v0, v0, v2 clamp +; GFX10PLUS-NEXT: v_pk_sub_i16 v1, v1, v3 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x float> ret <2 x float> %cast @@ -396,13 +397,13 @@ ; GFX9-NEXT: v_sub_i32 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ssubsat_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_i32 v0, v0, v2 clamp -; GFX10-NEXT: v_sub_nc_i32 v1, v1, v3 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ssubsat_v2i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v2 clamp +; GFX10PLUS-NEXT: v_sub_nc_i32 v1, v1, v3 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> 
%result } @@ -468,14 +469,14 @@ ; GFX9-NEXT: v_sub_i32 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ssubsat_v3i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_i32 v0, v0, v3 clamp -; GFX10-NEXT: v_sub_nc_i32 v1, v1, v4 clamp -; GFX10-NEXT: v_sub_nc_i32 v2, v2, v5 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ssubsat_v3i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v3 clamp +; GFX10PLUS-NEXT: v_sub_nc_i32 v1, v1, v4 clamp +; GFX10PLUS-NEXT: v_sub_nc_i32 v2, v2, v5 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.ssub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result } @@ -556,15 +557,15 @@ ; GFX9-NEXT: v_sub_i32 v3, v3, v7 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ssubsat_v4i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_i32 v0, v0, v4 clamp -; GFX10-NEXT: v_sub_nc_i32 v1, v1, v5 clamp -; GFX10-NEXT: v_sub_nc_i32 v2, v2, v6 clamp -; GFX10-NEXT: v_sub_nc_i32 v3, v3, v7 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ssubsat_v4i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v4 clamp +; GFX10PLUS-NEXT: v_sub_nc_i32 v1, v1, v5 clamp +; GFX10PLUS-NEXT: v_sub_nc_i32 v2, v2, v6 clamp +; GFX10PLUS-NEXT: v_sub_nc_i32 v3, v3, v7 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result } @@ -705,19 +706,19 @@ ; GFX9-NEXT: v_sub_i32 v7, v7, v15 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_ssubsat_v8i32: -; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_i32 v0, v0, v8 clamp -; GFX10-NEXT: v_sub_nc_i32 v1, v1, v9 clamp -; GFX10-NEXT: v_sub_nc_i32 v2, v2, v10 clamp -; GFX10-NEXT: v_sub_nc_i32 v3, v3, v11 clamp -; GFX10-NEXT: v_sub_nc_i32 v4, v4, v12 clamp -; GFX10-NEXT: v_sub_nc_i32 v5, v5, v13 clamp -; GFX10-NEXT: v_sub_nc_i32 v6, v6, v14 clamp -; GFX10-NEXT: v_sub_nc_i32 v7, v7, v15 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_ssubsat_v8i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_i32 v0, v0, v8 clamp +; GFX10PLUS-NEXT: v_sub_nc_i32 v1, v1, v9 clamp +; GFX10PLUS-NEXT: v_sub_nc_i32 v2, v2, v10 clamp +; GFX10PLUS-NEXT: v_sub_nc_i32 v3, v3, v11 clamp +; GFX10PLUS-NEXT: v_sub_nc_i32 v4, v4, v12 clamp +; GFX10PLUS-NEXT: v_sub_nc_i32 v5, v5, v13 clamp +; GFX10PLUS-NEXT: v_sub_nc_i32 v6, v6, v14 clamp +; GFX10PLUS-NEXT: v_sub_nc_i32 v7, v7, v15 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs) ret <8 x i32> %result } @@ -1007,6 +1008,30 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_i32 v15, v15, v31 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_ssubsat_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: v_sub_nc_i32 v0, v0, v16 clamp +; GFX11-NEXT: v_sub_nc_i32 v1, v1, v17 clamp +; GFX11-NEXT: v_sub_nc_i32 v2, v2, v18 clamp +; GFX11-NEXT: v_sub_nc_i32 v3, v3, v19 clamp +; GFX11-NEXT: v_sub_nc_i32 v4, v4, v20 clamp +; GFX11-NEXT: v_sub_nc_i32 v5, v5, v21 clamp +; GFX11-NEXT: v_sub_nc_i32 v6, v6, v22 clamp +; GFX11-NEXT: v_sub_nc_i32 v7, v7, v23 clamp +; GFX11-NEXT: v_sub_nc_i32 v8, v8, v24 clamp +; GFX11-NEXT: v_sub_nc_i32 v9, v9, v25 clamp +; 
GFX11-NEXT: v_sub_nc_i32 v10, v10, v26 clamp +; GFX11-NEXT: v_sub_nc_i32 v11, v11, v27 clamp +; GFX11-NEXT: v_sub_nc_i32 v12, v12, v28 clamp +; GFX11-NEXT: v_sub_nc_i32 v13, v13, v29 clamp +; GFX11-NEXT: v_sub_nc_i32 v14, v14, v30 clamp +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_i32 v15, v15, v31 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result } @@ -1069,6 +1094,20 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_ssubsat_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 +; GFX11-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3] +; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result } diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=MUBUF %s ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -mattr=+enable-flat-scratch -verify-machineinstrs | FileCheck -check-prefix=FLATSCR %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 
-amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=MUBUF11 %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+enable-flat-scratch -verify-machineinstrs | FileCheck -check-prefix=FLATSCR11 %s ; During instruction selection, we use immediate const zero for soffset in ; MUBUF stack accesses and let eliminateFrameIndex to fix up this field to use @@ -75,6 +77,62 @@ ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; FLATSCR-NEXT: .LBB0_2: ; %shader_eval_surface.exit ; FLATSCR-NEXT: s_endpgm +; +; MUBUF11-LABEL: kernel_background_evaluate: +; MUBUF11: ; %bb.0: ; %entry +; MUBUF11-NEXT: s_load_b32 s2, s[0:1], 0x24 +; MUBUF11-NEXT: v_mov_b32_e32 v1, 0x2000 +; MUBUF11-NEXT: v_dual_mov_b32 v2, 0x4000 :: v_dual_mov_b32 v3, 0 +; MUBUF11-NEXT: v_mov_b32_e32 v4, 0x400000 +; MUBUF11-NEXT: s_movk_i32 s32, 0x6000 +; MUBUF11-NEXT: s_getpc_b64 s[0:1] +; MUBUF11-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4 +; MUBUF11-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12 +; MUBUF11-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF11-NEXT: v_mov_b32_e32 v0, s2 +; MUBUF11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; MUBUF11-NEXT: s_mov_b32 s0, exec_lo +; MUBUF11-NEXT: v_cmpx_ne_u32_e32 0, v0 +; MUBUF11-NEXT: s_cbranch_execz .LBB0_2 +; MUBUF11-NEXT: ; %bb.1: ; %if.then4.i +; MUBUF11-NEXT: s_movk_i32 vcc_lo, 0x4000 +; MUBUF11-NEXT: s_mov_b32 s0, 0x41c64e6d +; MUBUF11-NEXT: scratch_load_b64 v[0:1], off, vcc_lo offset:4 +; MUBUF11-NEXT: s_waitcnt vmcnt(0) +; MUBUF11-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; MUBUF11-NEXT: v_mad_u64_u32 v[0:1], null, v2, s0, 0x3039 +; MUBUF11-NEXT: scratch_store_b32 off, v0, s0 +; MUBUF11-NEXT: .LBB0_2: ; %shader_eval_surface.exit +; MUBUF11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; MUBUF11-NEXT: s_endpgm +; +; FLATSCR11-LABEL: kernel_background_evaluate: +; FLATSCR11: ; %bb.0: ; %entry +; FLATSCR11-NEXT: s_load_b32 s2, s[0:1], 0x24 +; FLATSCR11-NEXT: v_mov_b32_e32 v1, 0x2000 +; FLATSCR11-NEXT: v_dual_mov_b32 v2, 0x4000 :: 
v_dual_mov_b32 v3, 0 +; FLATSCR11-NEXT: v_mov_b32_e32 v4, 0x400000 +; FLATSCR11-NEXT: s_movk_i32 s32, 0x6000 +; FLATSCR11-NEXT: s_getpc_b64 s[0:1] +; FLATSCR11-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4 +; FLATSCR11-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12 +; FLATSCR11-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR11-NEXT: v_mov_b32_e32 v0, s2 +; FLATSCR11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; FLATSCR11-NEXT: s_mov_b32 s0, exec_lo +; FLATSCR11-NEXT: v_cmpx_ne_u32_e32 0, v0 +; FLATSCR11-NEXT: s_cbranch_execz .LBB0_2 +; FLATSCR11-NEXT: ; %bb.1: ; %if.then4.i +; FLATSCR11-NEXT: s_movk_i32 vcc_lo, 0x4000 +; FLATSCR11-NEXT: s_mov_b32 s0, 0x41c64e6d +; FLATSCR11-NEXT: scratch_load_b64 v[0:1], off, vcc_lo offset:4 +; FLATSCR11-NEXT: s_waitcnt vmcnt(0) +; FLATSCR11-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; FLATSCR11-NEXT: v_mad_u64_u32 v[0:1], null, v2, s0, 0x3039 +; FLATSCR11-NEXT: scratch_store_b32 off, v0, s0 +; FLATSCR11-NEXT: .LBB0_2: ; %shader_eval_surface.exit +; FLATSCR11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; FLATSCR11-NEXT: s_endpgm entry: %sd = alloca < 1339 x i32>, align 8192, addrspace(5) %state = alloca <4 x i32>, align 16, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s define amdgpu_kernel void @store_lds_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %x) { ; GFX9-LABEL: store_lds_v4i32: @@ -59,6 +60,18 @@ ; 
GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: ds_write_b128 v4, v[0:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: ds_store_b128 v4, v[0:3] +; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out ret void } @@ -258,6 +271,46 @@ ; GFX10-NEXT: ds_write_b8 v0, v2 offset:5 ; GFX10-NEXT: ds_write_b8 v0, v3 offset:7 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v4i32_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: s_lshr_b32 s4, s2, 8 +; GFX11-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-NEXT: s_lshr_b32 s3, s3, 24 +; GFX11-NEXT: s_lshr_b32 s6, s0, 8 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s4 +; GFX11-NEXT: s_lshr_b32 s2, s2, 24 +; GFX11-NEXT: s_lshr_b32 s7, s1, 8 +; GFX11-NEXT: v_dual_mov_b32 v8, s3 :: v_dual_mov_b32 v9, s6 +; GFX11-NEXT: v_mov_b32_e32 v10, s0 +; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s5 +; GFX11-NEXT: ds_store_b8 v0, v1 offset:8 +; GFX11-NEXT: ds_store_b8 v0, v3 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:10 +; GFX11-NEXT: ds_store_b8 v0, v5 offset:9 +; GFX11-NEXT: ds_store_b8 v0, v2 offset:12 +; GFX11-NEXT: ds_store_b8 v0, v6 offset:11 +; GFX11-NEXT: ds_store_b8 v0, v7 offset:13 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:14 +; GFX11-NEXT: ds_store_b8 v0, v8 offset:15 +; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: s_lshr_b32 s0, 
s1, 24 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:2 +; GFX11-NEXT: ds_store_b8 v0, v9 offset:1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: ds_store_b8 v0, v4 offset:4 +; GFX11-NEXT: ds_store_b8 v0, v10 offset:3 +; GFX11-NEXT: ds_store_b8 v0, v1 offset:5 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v4 offset:6 +; GFX11-NEXT: ds_store_b8 v0, v2 offset:7 +; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void } @@ -361,6 +414,25 @@ ; GFX10-NEXT: ds_write_b16 v0, v4 offset:4 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v4 offset:6 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v4i32_align2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: v_mov_b32_e32 v4, s1 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:10 +; GFX11-NEXT: ds_store_b16 v0, v2 offset:12 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:14 +; GFX11-NEXT: ds_store_b16 v0, v3 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:2 +; GFX11-NEXT: ds_store_b16 v0, v4 offset:4 +; GFX11-NEXT: ds_store_b16 v0, v1 offset:8 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v4 offset:6 +; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2 ret void } @@ -424,6 +496,19 @@ ; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset0:2 offset1:3 ; GFX10-NEXT: ds_write2_b32 v0, v3, v4 offset1:1 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v4i32_align4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_mov_b32_e32 v4, s3 +; GFX11-NEXT: ds_store_2addr_b32 v0, 
v1, v2 offset1:1 +; GFX11-NEXT: ds_store_2addr_b32 v0, v3, v4 offset0:2 offset1:3 +; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 4 ret void } @@ -483,6 +568,19 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v4i32_align8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: ds_store_2addr_b64 v4, v[0:1], v[2:3] offset1:1 +; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 8 ret void } @@ -542,6 +640,18 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: ds_write_b128 v4, v[0:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v4i32_align16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: ds_store_b128 v4, v[0:3] +; GFX11-NEXT: s_endpgm store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 16 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < 
%s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { ; GFX9-LABEL: store_lds_v3i32: @@ -56,6 +57,17 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: ds_store_b96 v3, v[0:2] +; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out ret void } @@ -215,6 +227,38 @@ ; GFX10-NEXT: ds_write_b8 v0, v8 offset:5 ; GFX10-NEXT: ds_write_b8 v0, v9 offset:7 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v3i32_align1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s3, s2, 8 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: s_lshr_b32 s2, s2, 24 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s2 +; GFX11-NEXT: s_lshr_b32 s4, s0, 8 +; GFX11-NEXT: s_lshr_b32 s0, s0, 24 +; GFX11-NEXT: s_lshr_b32 s5, s1, 8 +; GFX11-NEXT: s_lshr_b32 s1, s1, 24 +; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s0 +; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s1 +; GFX11-NEXT: ds_store_b8 v0, v1 offset:8 +; GFX11-NEXT: ds_store_b8 v0, v2 +; GFX11-NEXT: ds_store_b8 v0, v4 offset:9 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v1 offset:10 +; GFX11-NEXT: ds_store_b8 v0, v5 offset:11 +; GFX11-NEXT: 
ds_store_b8_d16_hi v0, v2 offset:2 +; GFX11-NEXT: ds_store_b8 v0, v6 offset:1 +; GFX11-NEXT: ds_store_b8 v0, v3 offset:4 +; GFX11-NEXT: ds_store_b8 v0, v7 offset:3 +; GFX11-NEXT: ds_store_b8 v0, v8 offset:5 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:6 +; GFX11-NEXT: ds_store_b8 v0, v9 offset:7 +; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 ret void } @@ -302,6 +346,22 @@ ; GFX10-NEXT: ds_write_b16 v0, v3 offset:4 ; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v3i32_align2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:10 +; GFX11-NEXT: ds_store_b16 v0, v2 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:2 +; GFX11-NEXT: ds_store_b16 v0, v3 offset:4 +; GFX11-NEXT: ds_store_b16 v0, v1 offset:8 +; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:6 +; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2 ret void } @@ -361,6 +421,18 @@ ; GFX10-NEXT: ds_write_b32 v0, v1 offset:8 ; GFX10-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v3i32_align4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1 +; GFX11-NEXT: ds_store_b32 v0, v3 offset:8 +; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4 ret void } @@ -420,6 +492,18 @@ ; GFX10-NEXT: ds_write_b32 v2, v3 offset:8 ; GFX10-NEXT: ds_write_b64 
v2, v[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v3i32_align8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: ds_store_b32 v2, v3 offset:8 +; GFX11-NEXT: ds_store_b64 v2, v[0:1] +; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8 ret void } @@ -476,6 +560,17 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s2 ; GFX10-NEXT: ds_write_b96 v3, v[0:2] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: store_lds_v3i32_align16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 +; GFX11-NEXT: ds_store_b96 v3, v[0:2] +; GFX11-NEXT: s_endpgm store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,FIJI %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { ; CIVI-LABEL: local_store_i56: 
@@ -34,6 +35,16 @@ ; GFX10-NEXT: ds_write_b32 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: local_store_i56: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:6 +; GFX11-NEXT: ds_store_b16 v0, v2 offset:4 +; GFX11-NEXT: ds_store_b32 v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] store i56 %arg, i56 addrspace(3)* %ptr, align 8 ret void } @@ -124,6 +135,27 @@ ; GFX10-NEXT: ds_write_b8_d16_hi v1, v0 offset:6 ; GFX10-NEXT: ds_write_b32 v1, v3 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: local_store_i55: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[0:1] offset:14 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0xc +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s1, s2, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s2 +; GFX11-NEXT: v_mov_b32_e32 v3, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_or_b32_e32 v0, s1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; GFX11-NEXT: ds_store_b8_d16_hi v1, v0 offset:6 +; GFX11-NEXT: ds_store_b16 v1, v2 offset:4 +; GFX11-NEXT: ds_store_b32 v1, v3 +; GFX11-NEXT: s_endpgm store i55 %arg, i55 addrspace(3)* %ptr, align 8 ret void } @@ -183,6 +215,19 @@ ; GFX10-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX10-NEXT: ds_write_b32 v0, v2 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: local_store_i48: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0xc +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: 
ds_store_b16 v0, v1 offset:4 +; GFX11-NEXT: ds_store_b32 v0, v2 +; GFX11-NEXT: s_endpgm store i48 %arg, i48 addrspace(3)* %ptr, align 8 ret void } @@ -250,6 +295,21 @@ ; GFX10-NEXT: ds_write_b8 v2, v3 offset:8 ; GFX10-NEXT: ds_write_b64 v2, v[0:1] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: local_store_i65: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s2, s2, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: ds_store_b8 v2, v3 offset:8 +; GFX11-NEXT: ds_store_b64 v2, v[0:1] +; GFX11-NEXT: s_endpgm store i65 %arg, i65 addrspace(3)* %ptr, align 8 ret void } @@ -280,6 +340,15 @@ ; GFX10-NEXT: ds_write_b16 v0, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: local_store_i13: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX11-NEXT: ds_store_b16 v0, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] store i13 %arg, i13 addrspace(3)* %ptr, align 8 ret void } @@ -313,6 +382,16 @@ ; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: local_store_i17: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_and_b32_e32 v2, 0x1ffff, v1 +; GFX11-NEXT: ds_store_b16 v0, v1 +; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:2 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] store i17 %arg, i17 addrspace(3)* %ptr, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll 
b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s ; FIXME: promotion not handled without f16 insts define half @v_constained_fadd_f16_fpexcept_strict(half %x, half %y) #0 { @@ -11,12 +12,12 @@ ; GCN-NEXT: v_add_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fadd_f16_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -28,12 +29,12 @@ ; GCN-NEXT: v_add_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fadd_f16_fpexcept_ignore: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; 
GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_ignore: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret half %val } @@ -45,12 +46,12 @@ ; GCN-NEXT: v_add_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fadd_f16_fpexcept_maytrap: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_maytrap: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret half %val } @@ -70,12 +71,12 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fadd_v2f16_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fadd_v2f16_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_add_f16 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val } @@ -95,12 +96,12 @@ ; GFX8-NEXT: 
v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fadd_v2f16_fpexcept_ignore: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fadd_v2f16_fpexcept_ignore: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_add_f16 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x half> %val } @@ -120,12 +121,12 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fadd_v2f16_fpexcept_maytrap: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fadd_v2f16_fpexcept_maytrap: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_add_f16 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x half> %val } @@ -147,13 +148,13 @@ ; GFX8-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fadd_v3f16_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_add_f16 v0, v0, v2 -; GFX10-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fadd_v3f16_fpexcept_strict: +; 
GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX10PLUS-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fadd.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val } @@ -195,6 +196,24 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fadd_v4f16_fpexcept_strict: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX11-NEXT: v_add_f16_e32 v2, v5, v4 +; GFX11-NEXT: v_add_f16_e32 v3, v7, v6 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fadd.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val } @@ -206,10 +225,10 @@ ; GCN-NEXT: v_add_f16_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_constained_fadd_f16_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_f16_e64 v0, s2, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_constained_fadd_f16_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_add_f16_e64 v0, s2, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata 
!"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -233,10 +252,10 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_constained_fadd_v2f16_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_pk_add_f16 v0, s2, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_constained_fadd_v2f16_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_pk_add_f16 v0, s2, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %val = call <2 x half> @llvm.experimental.constrained.fadd.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val } diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f32.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define float @v_constained_fadd_f32_fpexcept_strict(float %x, float %y) #0 { ; GCN-LABEL: v_constained_fadd_f32_fpexcept_strict: @@ -9,12 +10,12 @@ ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fadd_f32_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fadd_f32_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val } @@ -26,12 +27,12 @@ ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fadd_f32_fpexcept_ignore: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fadd_f32_fpexcept_ignore: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret float %val } @@ -43,12 +44,12 @@ ; GCN-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fadd_f32_fpexcept_maytrap: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fadd_f32_fpexcept_maytrap: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret float %val } @@ -68,6 +69,13 @@ ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
v_constained_fadd_v2f32_fpexcept_strict: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x float> %val } @@ -87,6 +95,13 @@ ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fadd_v2f32_fpexcept_ignore: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %val } @@ -106,6 +121,13 @@ ; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fadd_v2f32_fpexcept_maytrap: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x float> %val } @@ -127,6 +149,14 @@ ; GFX10-NEXT: v_add_f32_e32 v1, v1, v4 ; GFX10-NEXT: v_add_f32_e32 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fadd_v3f32_fpexcept_strict: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v3 :: v_dual_add_f32 
v1, v1, v4 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float> %x, <3 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x float> %val } @@ -138,10 +168,10 @@ ; GCN-NEXT: v_add_f32_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_constained_fadd_f32_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_f32_e64 v0, s2, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_constained_fadd_f32_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_add_f32_e64 v0, s2, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val } @@ -153,12 +183,12 @@ ; GCN-NEXT: v_add_f32_e64 v0, |v0|, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fadd_f32_fpexcept_strict_fabs_lhs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e64 v0, |v0|, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fadd_f32_fpexcept_strict_fabs_lhs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %val = call float @llvm.experimental.constrained.fadd.f32(float %fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val @@ -171,12 +201,12 @@ ; GCN-NEXT: v_add_f32_e64 v0, v0, |v1| ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fadd_f32_fpexcept_strict_fabs_rhs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_add_f32_e64 
v0, v0, |v1| -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fadd_f32_fpexcept_strict_fabs_rhs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_add_f32_e64 v0, v0, |v1| +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %fabs.y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val @@ -189,12 +219,12 @@ ; GCN-NEXT: v_sub_f32_e64 v0, v1, |v0| ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fadd_f32_fpexcept_strict_fneg_fabs_lhs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f32_e64 v0, v1, |v0| -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fadd_f32_fpexcept_strict_fneg_fabs_lhs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_f32_e64 v0, v1, |v0| +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %neg.fabs.x = fneg float %fabs.x %val = call float @llvm.experimental.constrained.fadd.f32(float %neg.fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 
%s define double @v_constained_fadd_f64_fpexcept_strict(double %x, double %y) #0 { ; GCN-LABEL: v_constained_fadd_f64_fpexcept_strict: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -2,6 +2,7 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10 %s define half @v_constained_fma_f16_fpexcept_strict(half %x, half %y, half %z) #0 { ; GCN-LABEL: v_constained_fma_f16_fpexcept_strict: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f32.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s define float @v_constained_fma_f32_fpexcept_strict(float %x, float %y, float %z) #0 { ; GCN-LABEL: v_constained_fma_f32_fpexcept_strict: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck 
-check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s define double @v_constained_fma_f64_fpexcept_strict(double %x, double %y, double %z) #0 { ; GCN-LABEL: v_constained_fma_f64_fpexcept_strict: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s ; FIXME: promotion not handled without f16 insts define half @v_constained_fmul_f16_fpexcept_strict(half %x, half %y) #0 { @@ -11,12 +12,12 @@ ; GCN-NEXT: v_mul_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fmul_f16_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fmul_f16_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fmul.f16(half %x, 
half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -28,12 +29,12 @@ ; GCN-NEXT: v_mul_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fmul_f16_fpexcept_ignore: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fmul_f16_fpexcept_ignore: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret half %val } @@ -45,12 +46,12 @@ ; GCN-NEXT: v_mul_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fmul_f16_fpexcept_maytrap: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fmul_f16_fpexcept_maytrap: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret half %val } @@ -70,12 +71,12 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fmul_v2f16_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: 
v_constained_fmul_v2f16_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val } @@ -95,12 +96,12 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fmul_v2f16_fpexcept_ignore: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fmul_v2f16_fpexcept_ignore: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x half> %val } @@ -120,12 +121,12 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fmul_v2f16_fpexcept_maytrap: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_mul_f16 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x half> 
%val } @@ -147,13 +148,13 @@ ; GFX8-NEXT: v_mul_f16_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fmul_v3f16_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX10-NEXT: v_mul_f16_e32 v1, v1, v3 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fmul_v3f16_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX10PLUS-NEXT: v_mul_f16_e32 v1, v1, v3 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fmul.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val } @@ -195,6 +196,24 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fmul_v4f16_fpexcept_strict: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_mul_f16_e32 v1, v1, v3 +; GFX11-NEXT: v_mul_f16_e32 v2, v5, v4 +; GFX11-NEXT: v_mul_f16_e32 v3, v7, v6 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fmul.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val } @@ -206,10 +225,10 @@ ; GCN-NEXT: v_mul_f16_e32 v0, s2, v0 ; GCN-NEXT: ; return 
to shader part epilog ; -; GFX10-LABEL: s_constained_fmul_f16_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mul_f16_e64 v0, s2, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_constained_fmul_f16_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_mul_f16_e64 v0, s2, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -233,10 +252,10 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_constained_fmul_v2f16_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_pk_mul_f16 v0, s2, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_constained_fmul_v2f16_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_pk_mul_f16 v0, s2, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %val = call <2 x half> @llvm.experimental.constrained.fmul.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val } diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f32.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f32.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define float @v_constained_fmul_f32_fpexcept_strict(float %x, float %y) #0 { ; GCN-LABEL: v_constained_fmul_f32_fpexcept_strict: @@ -9,12 +10,12 @@ ; 
GCN-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fmul_f32_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fmul_f32_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val } @@ -26,12 +27,12 @@ ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fmul_f32_fpexcept_ignore: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fmul_f32_fpexcept_ignore: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret float %val } @@ -43,12 +44,12 @@ ; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fmul_f32_fpexcept_maytrap: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fmul_f32_fpexcept_maytrap: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret float %val } @@ -68,6 +69,13 @@ ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fmul_v2f32_fpexcept_strict: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x float> %val } @@ -87,6 +95,13 @@ ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fmul_v2f32_fpexcept_ignore: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %val } @@ -106,6 +121,13 @@ ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fmul_v2f32_fpexcept_maytrap: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata 
!"fpexcept.maytrap") ret <2 x float> %val } @@ -127,6 +149,14 @@ ; GFX10-NEXT: v_mul_f32_e32 v1, v1, v4 ; GFX10-NEXT: v_mul_f32_e32 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fmul_v3f32_fpexcept_strict: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v3 :: v_dual_mul_f32 v1, v1, v4 +; GFX11-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <3 x float> @llvm.experimental.constrained.fmul.v3f32(<3 x float> %x, <3 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x float> %val } @@ -138,10 +168,10 @@ ; GCN-NEXT: v_mul_f32_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_constained_fmul_f32_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mul_f32_e64 v0, s2, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_constained_fmul_f32_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_mul_f32_e64 v0, s2, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val } @@ -153,12 +183,12 @@ ; GCN-NEXT: v_mul_f32_e64 v0, |v0|, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fmul_f32_fpexcept_strict_fabs_lhs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f32_e64 v0, |v0|, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fmul_f32_fpexcept_strict_fabs_lhs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %val = call float 
@llvm.experimental.constrained.fmul.f32(float %fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val @@ -171,12 +201,12 @@ ; GCN-NEXT: v_mul_f32_e64 v0, v0, |v1| ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fmul_f32_fpexcept_strict_fabs_rhs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f32_e64 v0, v0, |v1| -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fmul_f32_fpexcept_strict_fabs_rhs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mul_f32_e64 v0, v0, |v1| +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %fabs.y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val @@ -189,12 +219,12 @@ ; GCN-NEXT: v_mul_f32_e64 v0, -|v0|, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fmul_f32_fpexcept_strict_fneg_fabs_lhs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mul_f32_e64 v0, -|v0|, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fmul_f32_fpexcept_strict_fneg_fabs_lhs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %neg.fabs.x = fneg float %fabs.x %val = call float @llvm.experimental.constrained.fmul.f32(float %neg.fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll 
+++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s define double @v_constained_fmul_f64_fpexcept_strict(double %x, double %y) #0 { ; GCN-LABEL: v_constained_fmul_f64_fpexcept_strict: diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s ; FIXME: promotion not handled without f16 insts define half @v_constained_fsub_f16_fpexcept_strict(half %x, half %y) #0 { @@ -11,12 +12,12 @@ ; GCN-NEXT: v_sub_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fsub_f16_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -28,12 +29,12 @@ ; GCN-NEXT: v_sub_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fsub_f16_fpexcept_ignore: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_ignore: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret half %val } @@ -45,12 +46,12 @@ ; GCN-NEXT: v_sub_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fsub_f16_fpexcept_maytrap: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_maytrap: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret half %val } @@ -81,6 +82,18 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
v_constained_fsub_v2f16_fpexcept_strict: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX11-NEXT: v_sub_f16_e32 v1, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val } @@ -111,6 +124,18 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fsub_v2f16_fpexcept_ignore: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX11-NEXT: v_sub_f16_e32 v1, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x half> %val } @@ -141,6 +166,18 @@ ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX11-NEXT: v_sub_f16_e32 v1, v3, v2 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: 
s_setpc_b64 s[30:31] %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x half> %val } @@ -174,6 +211,19 @@ ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fsub_v3f16_fpexcept_strict: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX11-NEXT: v_sub_f16_e32 v2, v5, v4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fsub.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val } @@ -215,6 +265,24 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fsub_v4f16_fpexcept_strict: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX11-NEXT: v_sub_f16_e32 v2, v5, v4 +; GFX11-NEXT: v_sub_f16_e32 v3, v7, v6 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fsub.v4f16(<4 x half> %x, <4 x half> %y, metadata 
!"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val } @@ -226,10 +294,10 @@ ; GCN-NEXT: v_sub_f16_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_constained_fsub_f16_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_sub_f16_e64 v0, s2, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_constained_fsub_f16_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_sub_f16_e64 v0, s2, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -258,15 +326,15 @@ ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_constained_fsub_v2f16_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_sub_f16_e64 v0, s2, s3 -; GFX10-NEXT: s_lshr_b32 s0, s3, 16 -; GFX10-NEXT: s_lshr_b32 s1, s2, 16 -; GFX10-NEXT: v_sub_f16_e64 v1, s1, s0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_constained_fsub_v2f16_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_sub_f16_e64 v0, s2, s3 +; GFX10PLUS-NEXT: s_lshr_b32 s0, s3, 16 +; GFX10PLUS-NEXT: s_lshr_b32 s1, s2, 16 +; GFX10PLUS-NEXT: v_sub_f16_e64 v1, s1, s0 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10PLUS-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX10PLUS-NEXT: ; return to shader part epilog %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val } diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f32.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f32.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f32.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define float @v_constained_fsub_f32_fpexcept_strict(float %x, float %y) #0 { ; GCN-LABEL: v_constained_fsub_f32_fpexcept_strict: @@ -9,12 +10,12 @@ ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fsub_f32_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fsub_f32_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val } @@ -26,12 +27,12 @@ ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fsub_f32_fpexcept_ignore: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fsub_f32_fpexcept_ignore: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call float 
@llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret float %val } @@ -43,12 +44,12 @@ ; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fsub_f32_fpexcept_maytrap: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fsub_f32_fpexcept_maytrap: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %val = call float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret float %val } @@ -68,6 +69,13 @@ ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fsub_v2f32_fpexcept_strict: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x float> %val } @@ -87,6 +95,13 @@ ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fsub_v2f32_fpexcept_ignore: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %x, <2 x float> 
%y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %val } @@ -106,6 +121,13 @@ ; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fsub_v2f32_fpexcept_maytrap: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %x, <2 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret <2 x float> %val } @@ -127,6 +149,14 @@ ; GFX10-NEXT: v_sub_f32_e32 v1, v1, v4 ; GFX10-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constained_fsub_v3f32_fpexcept_strict: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_sub_f32 v0, v0, v3 :: v_dual_sub_f32 v1, v1, v4 +; GFX11-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val = call <3 x float> @llvm.experimental.constrained.fsub.v3f32(<3 x float> %x, <3 x float> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x float> %val } @@ -138,10 +168,10 @@ ; GCN-NEXT: v_sub_f32_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: s_constained_fsub_f32_fpexcept_strict: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_sub_f32_e64 v0, s2, s3 -; GFX10-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: s_constained_fsub_f32_fpexcept_strict: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_sub_f32_e64 v0, s2, s3 +; GFX10PLUS-NEXT: ; return to shader part epilog %val = call float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val } @@ -153,12 +183,12 @@ ; GCN-NEXT: v_sub_f32_e64 v0, |v0|, v1 ; GCN-NEXT: s_setpc_b64 
s[30:31] ; -; GFX10-LABEL: v_constained_fsub_f32_fpexcept_strict_fabs_lhs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f32_e64 v0, |v0|, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fsub_f32_fpexcept_strict_fabs_lhs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_f32_e64 v0, |v0|, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %val = call float @llvm.experimental.constrained.fsub.f32(float %fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val @@ -171,12 +201,12 @@ ; GCN-NEXT: v_sub_f32_e64 v0, v0, |v1| ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fsub_f32_fpexcept_strict_fabs_rhs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f32_e64 v0, v0, |v1| -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_constained_fsub_f32_fpexcept_strict_fabs_rhs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_f32_e64 v0, v0, |v1| +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fabs.y = call float @llvm.fabs.f32(float %y) %val = call float @llvm.experimental.constrained.fsub.f32(float %x, float %fabs.y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret float %val @@ -189,12 +219,12 @@ ; GCN-NEXT: v_sub_f32_e64 v0, -|v0|, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_constained_fsub_f32_fpexcept_strict_fneg_fabs_lhs: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_f32_e64 v0, -|v0|, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: 
v_constained_fsub_f32_fpexcept_strict_fneg_fabs_lhs: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_f32_e64 v0, -|v0|, v1 +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %fabs.x = call float @llvm.fabs.f32(float %x) %neg.fabs.x = fneg float %fabs.x %val = call float @llvm.experimental.constrained.fsub.f32(float %neg.fabs.x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s define double @v_constained_fsub_f64_fpexcept_strict(double %x, double %y) #0 { ; GCN-LABEL: v_constained_fsub_f64_fpexcept_strict: diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10 +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope 
-check-prefixes=GFX11 ; FIXME: Need to handle non-uniform case for function below (load without gep). define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { @@ -62,6 +63,24 @@ ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_sub_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -128,6 +147,22 @@ ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_sub_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_sub_i16 v0, s2, s0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load <2 x i16>, <2 x i16> 
addrspace(4)* %in0 %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1 %add = sub <2 x i16> %a, %b @@ -155,6 +190,17 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_sub_self_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0 %add = sub <2 x i16> %a, %a store <2 x i16> %add, <2 x i16> addrspace(1)* %out @@ -204,6 +250,19 @@ ; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_test_sub_v2i16_kernarg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_pk_sub_i16 v0, s2, s3 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %add = sub <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void @@ -256,6 +315,20 @@ ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_sub_v2i16_constant: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b +; GFX11-NEXT: buffer_store_b32 v0, off, 
s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -313,6 +386,20 @@ ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_sub_v2i16_neg_constant: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -368,6 +455,20 @@ ; GFX10-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_sub_v2i16_inline_neg1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid 
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -422,6 +523,20 @@ ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 32 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_sub_i16 v0, v0, 32 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -478,6 +593,20 @@ ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0] ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0] +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -549,6 +678,27 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: buffer_store_dwordx2 
v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -631,6 +781,30 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 
v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v2, 0, 16, v2 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -707,6 +881,27 @@ ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -789,6 +984,31 @@ ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 
2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sub_i1.ll b/llvm/test/CodeGen/AMDGPU/sub_i1.ll --- a/llvm/test/CodeGen/AMDGPU/sub_i1.ll +++ b/llvm/test/CodeGen/AMDGPU/sub_i1.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s ; GCN-LABEL: {{^}}sub_var_var_i1: diff --git a/llvm/test/CodeGen/AMDGPU/subvector-test.mir b/llvm/test/CodeGen/AMDGPU/subvector-test.mir --- a/llvm/test/CodeGen/AMDGPU/subvector-test.mir +++ b/llvm/test/CodeGen/AMDGPU/subvector-test.mir @@ -1,4 +1,5 @@ # RUN: llc -march=amdgcn -mcpu=gfx1010 -start-before=greedy -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -start-before=greedy 
-verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s ... # GCN-LABEL: {{^}}"subvector-basic-bb" # GCN: s_subvector_loop_begin [[RS:s[0-9]]], .LBB0_2 diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir --- a/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-fma.mir @@ -1,4 +1,5 @@ # RUN: llc -march=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: test_fmamk_reg_imm_f32 # GCN: %2:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck --check-prefix=GFX6 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-LABEL: v_usubsat_i8: @@ -26,14 +27,14 @@ ; GFX9-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_usubsat_i8: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_sub_nc_u16 
v0, v0, v1 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_usubsat_i8: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result } @@ -60,12 +61,12 @@ ; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_usubsat_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_usubsat_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) ret i16 %result } @@ -94,12 +95,12 @@ ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: usubsat_as_bithack_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: usubsat_as_bithack_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %signsplat = ashr i16 %x, 15 %flipsign = xor i16 %x, 32768 %result = and i16 %signsplat, %flipsign @@ -130,12 +131,12 @@ ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: usubsat_as_bithack2_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: usubsat_as_bithack2_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %signsplat = ashr i16 %x, 15 %flipsign = add i16 %x, 32768 %result = and i16 %signsplat, %flipsign @@ -166,12 +167,12 @@ ; GFX9-NEXT: v_sub_u16_e64 v0, v0, s4 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: usubsat_as_bithack_commute_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: usubsat_as_bithack_commute_i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, 0x8000 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %signsplat = ashr i16 %x, 15 %flipsign = add i16 %x, 32768 %result = and i16 %flipsign, %signsplat @@ -198,12 +199,12 @@ ; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_usubsat_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_usubsat_i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) ret i32 %result } @@ -239,12 +240,12 @@ ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp ; GFX9-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_usubsat_v2i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_usubsat_v2i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v1 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) ret <2 x i16> %result } @@ -286,13 +287,13 @@ ; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_usubsat_v3i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_sub_u16 v0, v0, v2 clamp -; GFX10-NEXT: v_pk_sub_u16 v1, v1, v3 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_usubsat_v3i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v2 clamp +; GFX10PLUS-NEXT: v_pk_sub_u16 v1, v1, v3 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) ret <3 x i16> %result } @@ -341,13 +342,13 @@ ; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_usubsat_v4i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_pk_sub_u16 v0, v0, v2 clamp -; GFX10-NEXT: v_pk_sub_u16 v1, v1, v3 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_usubsat_v4i16: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v2 clamp +; GFX10PLUS-NEXT: 
v_pk_sub_u16 v1, v1, v3 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) %cast = bitcast <4 x i16> %result to <2 x float> ret <2 x float> %cast @@ -377,13 +378,13 @@ ; GFX9-NEXT: v_sub_u32_e64 v1, v1, v3 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_usubsat_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v2 clamp -; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v3 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_usubsat_v2i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v2 clamp +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v3 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) ret <2 x i32> %result } @@ -416,14 +417,14 @@ ; GFX9-NEXT: v_sub_u32_e64 v2, v2, v5 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_usubsat_v3i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v3 clamp -; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v4 clamp -; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v5 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_usubsat_v3i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v3 clamp +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v4 clamp +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v5 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) ret <3 x i32> %result } @@ -460,15 +461,15 @@ ; GFX9-NEXT: v_sub_u32_e64 v3, v3, v7 clamp ; GFX9-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_usubsat_v4i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v4 clamp -; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v5 clamp -; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v6 clamp -; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v7 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_usubsat_v4i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v4 clamp +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v5 clamp +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v6 clamp +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, v3, v7 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) ret <4 x i32> %result } @@ -521,19 +522,19 @@ ; GFX9-NEXT: v_sub_u32_e64 v7, v7, v15 clamp ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_usubsat_v8i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v8 clamp -; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v9 clamp -; GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v10 clamp -; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v11 clamp -; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v12 clamp -; GFX10-NEXT: v_sub_nc_u32_e64 v5, v5, v13 clamp -; GFX10-NEXT: v_sub_nc_u32_e64 v6, v6, v14 clamp -; GFX10-NEXT: v_sub_nc_u32_e64 v7, v7, v15 clamp -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_usubsat_v8i32: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v8 clamp +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v9 clamp +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v10 clamp +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, v3, v11 clamp +; 
GFX10PLUS-NEXT: v_sub_nc_u32_e64 v4, v4, v12 clamp +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v5, v5, v13 clamp +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v6, v6, v14 clamp +; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v7, v7, v15 clamp +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> %lhs, <8 x i32> %rhs) ret <8 x i32> %result } @@ -647,6 +648,30 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_usubsat_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v4, v4, v20 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v5, v5, v21 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v6, v6, v22 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v7, v7, v23 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v8, v8, v24 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v9, v9, v25 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v10, v10, v26 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v11, v11, v27 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp +; GFX11-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) ret <16 x i32> %result } @@ -683,16 +708,16 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_usubsat_i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 -; GFX10-NEXT: 
v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10PLUS-LABEL: v_usubsat_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo +; GFX10PLUS-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo +; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result } diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) { ; GFX9-LABEL: shuffle_v4f16_23uu: @@ -17,6 +18,14 @@ ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_23uu: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -46,6 +55,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_234u: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -67,6 +85,14 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_u1u3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -90,6 +116,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_u3u1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -111,6 +146,14 @@ ; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_u3uu: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -140,6 +183,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_3u6u: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -169,6 +223,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_3uu7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -204,6 +269,21 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_35u5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -239,6 +319,22 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_357u: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -262,6 +358,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_0101: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -283,6 +388,14 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_0123: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -312,6 +425,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_0145: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -341,6 +463,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v5 
; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_0167: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -364,6 +495,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_2301: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -387,6 +527,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_2323: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -416,6 +565,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_2345: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -445,6 +603,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_2367: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -474,6 +641,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_4501: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -503,6 +681,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_4523: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -526,6 +715,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_4545: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -547,6 +745,14 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_4567: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -576,6 +782,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_6701: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: 
global_load_b32 v2, v[2:3], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -605,6 +822,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_6723: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -628,6 +856,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_6745: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -651,6 +888,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_6767: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[2:3], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -684,6 +930,20 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_2356: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -717,6 +977,20 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_5623: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x 
half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -752,6 +1026,24 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v5, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_3456: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -787,6 +1079,24 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_5634: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: 
v_and_b32_e32 v4, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -824,6 +1134,25 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_5734: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v1 +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -857,6 +1186,19 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4i16_2356: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: 
s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> @@ -886,6 +1228,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4i16_0167: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 %shuffle = shufflevector <4 x i16> %val0, <4 x i16> %val1, <4 x i32> @@ -913,6 +1264,18 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_0000: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> zeroinitializer @@ -942,6 +1305,20 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-LABEL: shuffle_v4f16_1010: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -973,6 +1350,21 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_1100: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -1006,6 +1398,21 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_6161: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -1033,6 +1440,18 @@ ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_2333: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -1060,6 +1479,18 @@ ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_6667: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_and_b32_e32 v2, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -1083,6 +1514,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v8f16_0101: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> @@ -1104,6 +1544,14 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v8f16_0123: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> @@ -1133,6 +1581,15 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v8f16_4589: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:8 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> 
addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> @@ -1162,6 +1619,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v8f16_10_11_2_3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> @@ -1195,6 +1663,20 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v4 ; GFX10-NEXT: v_lshl_or_b32 v0, v6, 16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v8f16_13_14_2_3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8 +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 %shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> @@ -1220,6 +1702,17 @@ ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v3f16_0122: +; GFX11: ; %bb.0: +; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0 %val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1 %shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> @@ -1247,6 +1740,18 @@ ; GFX10-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_lshl_or_b32 v1, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v2f16_0122: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v1, v0, 16, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0 %val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1 %shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> @@ -1284,6 +1789,25 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v6f16_452367: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b96 v[4:6], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b96 off, v[4:6], s32 +; GFX11-NEXT: global_load_b96 v[4:6], v[2:3], off +; GFX11-NEXT: scratch_load_b128 v[0:3], off, s32 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: scratch_store_b96 off, v[4:6], 
s32 offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:16 +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0 %val1 = load <6 x half>, <6 x half> addrspace(1)* %arg1 %shuffle = shufflevector <6 x half> %val0, <6 x half> %val1, <6 x i32> @@ -1326,6 +1850,27 @@ ; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] ; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fma_shuffle: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[6:7] +; GFX11-NEXT: global_load_b64 v[4:5], v6, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] +; GFX11-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] +; GFX11-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm entry: %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp12 = zext i32 %tmp1 to i64 @@ -1388,6 +1933,22 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 ; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4f16_0456: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; 
GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 %shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> @@ -1422,6 +1983,19 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: shuffle_scalar_load_v8i32_0123: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3] +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16 %id = shufflevector <8 x i32> %ld8, <8 x i32> undef, <4 x i32> store <4 x i32> %id, <4 x i32> addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GCN +; RUN: llc < %s 
-march=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11 define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) #0 { ; GCN-LABEL: vgpr_descriptor_waterfall_loop_idom_update: @@ -34,6 +35,37 @@ ; GCN-NEXT: ; %bb.3: ; in Loop: Header=BB0_1 Depth=1 ; GCN-NEXT: s_mov_b32 exec_lo, s5 ; GCN-NEXT: s_branch .LBB0_1 +; +; GFX11-LABEL: vgpr_descriptor_waterfall_loop_idom_update: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB0_1: ; %bb0 +; GFX11-NEXT: ; =>This Loop Header: Depth=1 +; GFX11-NEXT: ; Child Loop BB0_2 Depth 2 +; GFX11-NEXT: flat_load_b128 v[2:5], v[0:1] +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: .LBB0_2: ; Parent Loop BB0_1 Depth=1 +; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_readfirstlane_b32 s4, v2 +; GFX11-NEXT: v_readfirstlane_b32 s5, v3 +; GFX11-NEXT: v_readfirstlane_b32 s6, v4 +; GFX11-NEXT: v_readfirstlane_b32 s7, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[2:3] +; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_and_saveexec_b32 s0, s0 +; GFX11-NEXT: buffer_store_b32 v0, v0, s[4:7], 0 offen +; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 +; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB0_2 +; GFX11-NEXT: ; %bb.3: ; in Loop: Header=BB0_1 Depth=1 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 +; GFX11-NEXT: s_branch .LBB0_1 entry: br label %bb0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ 
-1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s declare void @extern_func() #2 @@ -133,6 +134,66 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: non_preserved_vgpr_tuple8: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:16 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15 +; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 +; GFX11-NEXT: v_mov_b32_e32 v32, v12 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_mov_b32 s1, s0 +; GFX11-NEXT: s_mov_b32 s2, s0 +; GFX11-NEXT: s_mov_b32 s3, s0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s0 +; GFX11-NEXT: s_mov_b32 s6, s0 +; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v44, s33 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: 
s_add_i32 s32, s32, 32 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, v41 :: v_dual_mov_b32 v1, v42 +; GFX11-NEXT: v_dual_mov_b32 v2, v43 :: v_dual_mov_b32 v3, v44 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_load_b32 v44, off, s33 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:16 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -303,6 +364,77 @@ ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: call_preserved_vgpr_tuple8: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:20 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v40, s33, 10 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v45, s33 +; 
GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15 +; GFX11-NEXT: v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_mov_b32_e32 v45, v12 +; GFX11-NEXT: v_writelane_b32 v40, s36, 2 +; GFX11-NEXT: s_mov_b32 s36, 0 +; GFX11-NEXT: v_writelane_b32 v40, s37, 3 +; GFX11-NEXT: s_mov_b32 s37, s36 +; GFX11-NEXT: v_writelane_b32 v40, s38, 4 +; GFX11-NEXT: s_mov_b32 s38, s36 +; GFX11-NEXT: v_writelane_b32 v40, s39, 5 +; GFX11-NEXT: s_mov_b32 s39, s36 +; GFX11-NEXT: v_writelane_b32 v40, s40, 6 +; GFX11-NEXT: s_mov_b32 s40, s36 +; GFX11-NEXT: v_writelane_b32 v40, s41, 7 +; GFX11-NEXT: s_mov_b32 s41, s36 +; GFX11-NEXT: v_writelane_b32 v40, s42, 8 +; GFX11-NEXT: s_mov_b32 s42, s36 +; GFX11-NEXT: v_writelane_b32 v40, s43, 9 +; GFX11-NEXT: s_mov_b32 s43, s36 +; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: scratch_load_b32 v45, off, s33 +; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:12 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16 +; GFX11-NEXT: v_readlane_b32 s43, v40, 9 +; GFX11-NEXT: v_readlane_b32 s42, v40, 8 +; GFX11-NEXT: v_readlane_b32 s41, v40, 7 +; GFX11-NEXT: v_readlane_b32 s40, v40, 6 +; GFX11-NEXT: v_readlane_b32 s39, v40, 5 
+; GFX11-NEXT: v_readlane_b32 s38, v40, 4 +; GFX11-NEXT: v_readlane_b32 s37, v40, 3 +; GFX11-NEXT: v_readlane_b32 s36, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 +; GFX11-NEXT: v_readlane_b32 s33, v40, 10 +; GFX11-NEXT: s_or_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:20 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-overflow.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9 %s # RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX10 %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX11 %s --- | define amdgpu_kernel void @max-counter-lgkmcnt() #0 { ret void } @@ -79,6 +80,36 @@ ; GFX10-NEXT: S_WAITCNT 52863 ; GFX10-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec ; GFX10-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: max-counter-lgkmcnt + ; GFX11: S_WAITCNT 0 + ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX11-NEXT: $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec + ; GFX11-NEXT: $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec + ; GFX11-NEXT: $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable $vgpr99, 4, 5, 0, implicit $exec + ; GFX11-NEXT: $vgpr6_vgpr7 = DS_READ2_B32_gfx9 renamable $vgpr99, 6, 7, 0, implicit $exec + ; GFX11-NEXT: $vgpr8_vgpr9 = DS_READ2_B32_gfx9 
renamable $vgpr99, 8, 9, 0, implicit $exec + ; GFX11-NEXT: $vgpr10_vgpr11 = DS_READ2_B32_gfx9 renamable $vgpr99, 10, 11, 0, implicit $exec + ; GFX11-NEXT: $vgpr12_vgpr13 = DS_READ2_B32_gfx9 renamable $vgpr99, 12, 13, 0, implicit $exec + ; GFX11-NEXT: $vgpr14_vgpr15 = DS_READ2_B32_gfx9 renamable $vgpr99, 14, 15, 0, implicit $exec + ; GFX11-NEXT: $vgpr16_vgpr17 = DS_READ2_B32_gfx9 renamable $vgpr99, 16, 17, 0, implicit $exec + ; GFX11-NEXT: $vgpr18_vgpr19 = DS_READ2_B32_gfx9 renamable $vgpr99, 18, 19, 0, implicit $exec + ; GFX11-NEXT: $vgpr20_vgpr21 = DS_READ2_B32_gfx9 renamable $vgpr99, 20, 21, 0, implicit $exec + ; GFX11-NEXT: $vgpr22_vgpr23 = DS_READ2_B32_gfx9 renamable $vgpr99, 22, 23, 0, implicit $exec + ; GFX11-NEXT: $vgpr24_vgpr25 = DS_READ2_B32_gfx9 renamable $vgpr99, 24, 25, 0, implicit $exec + ; GFX11-NEXT: $vgpr26_vgpr27 = DS_READ2_B32_gfx9 renamable $vgpr99, 26, 27, 0, implicit $exec + ; GFX11-NEXT: $vgpr28_vgpr29 = DS_READ2_B32_gfx9 renamable $vgpr99, 28, 29, 0, implicit $exec + ; GFX11-NEXT: $vgpr30_vgpr31 = DS_READ2_B32_gfx9 renamable $vgpr99, 30, 31, 0, implicit $exec + ; GFX11-NEXT: $vgpr32_vgpr33 = DS_READ2_B32_gfx9 renamable $vgpr99, 32, 33, 0, implicit $exec + ; GFX11-NEXT: $vgpr34_vgpr35 = DS_READ2_B32_gfx9 renamable $vgpr99, 34, 35, 0, implicit $exec + ; GFX11-NEXT: S_WAITCNT 64791 + ; GFX11-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_WAITCNT 64775 + ; GFX11-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec + ; GFX11-NEXT: S_WAITCNT 64759 + ; GFX11-NEXT: $vgpr4 = V_MAC_F32_e32 0, $vgpr5, $vgpr4, implicit $mode, implicit $exec + ; GFX11-NEXT: S_WAITCNT 64743 + ; GFX11-NEXT: $vgpr6 = V_MAC_F32_e32 0, $vgpr7, $vgpr6, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 $vgpr0_vgpr1 = DS_READ2_B32_gfx9 renamable $vgpr99, 0, 1, 0, implicit $exec $vgpr2_vgpr3 = DS_READ2_B32_gfx9 renamable $vgpr99, 2, 3, 0, implicit $exec $vgpr4_vgpr5 = DS_READ2_B32_gfx9 renamable 
$vgpr99, 4, 5, 0, implicit $exec @@ -266,6 +297,82 @@ ; GFX10-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec ; GFX10-NEXT: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec ; GFX10-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: max-counter-vmcnt + ; GFX11: S_WAITCNT 0 + ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX11-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr4 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr5 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 20, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr6 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 24, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr7 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 28, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr8 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 32, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr9 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 36, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr10 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 40, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr11 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 44, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr12 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 48, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr13 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 52, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr14 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 56, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 60, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr16 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 64, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr17 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 68, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr18 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 72, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr19 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 76, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr20 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 80, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr21 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 84, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr22 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 88, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr23 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 92, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr24 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 96, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr25 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 100, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr26 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 104, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr27 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 108, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr28 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 112, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr29 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 116, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr30 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 120, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr31 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 124, 0, 0, 0, implicit $exec + ; 
GFX11-NEXT: $vgpr32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 128, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr33 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 132, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr34 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 136, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr35 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 140, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr36 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 144, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr37 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 148, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr38 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 152, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr39 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 156, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 160, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 164, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 168, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 172, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 176, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 180, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 184, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 188, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr48 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 192, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr49 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 196, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr50 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 200, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr51 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 204, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr52 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 208, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr53 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 212, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr54 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 216, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr55 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 220, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 224, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 228, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 232, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 236, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 240, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 244, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 248, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 252, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr64 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 256, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr65 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 260, 0, 0, 0, implicit $exec + ; GFX11-NEXT: $vgpr66 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 264, 0, 0, 0, implicit 
$exec + ; GFX11-NEXT: S_WAITCNT 64503 + ; GFX11-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr1 = V_MAC_F32_e32 0, $vgpr2, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2 = V_MAC_F32_e32 0, $vgpr3, $vgpr2, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr3 = V_MAC_F32_e32 0, $vgpr4, $vgpr3, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, implicit $exec $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, implicit $exec $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, implicit $exec @@ -376,6 +483,19 @@ ; GFX10-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec ; GFX10-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec ; GFX10-NEXT: S_ENDPGM 0 + ; GFX11-LABEL: name: max-counter-expcnt + ; GFX11: S_WAITCNT 0 + ; GFX11-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX11-NEXT: EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec + ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX11-NEXT: EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec + ; GFX11-NEXT: $vgpr0 = V_MAC_F32_e32 0, $vgpr1, $vgpr0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0 EXP 0, $vgpr0, $vgpr0, $vgpr0, $vgpr0, -1, -1, 15, implicit $exec EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit $exec EXP 0, $vgpr1, $vgpr1, $vgpr1, $vgpr1, -1, -1, 15, implicit 
$exec diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll --- a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll @@ -4,6 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s ; RUN: llc -march=amdgcn -mcpu=gfx902 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s ; REQUIRES: asserts diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll --- a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll @@ -4,6 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s ; REQUIRES: asserts diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll --- a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll +++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll @@ -4,6 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 
2>&1 | FileCheck --check-prefix=ON %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s ; REQUIRES: asserts diff --git a/llvm/test/CodeGen/AMDGPU/xor3.ll b/llvm/test/CodeGen/AMDGPU/xor3.ll --- a/llvm/test/CodeGen/AMDGPU/xor3.ll +++ b/llvm/test/CodeGen/AMDGPU/xor3.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s ; =================================================================================== ; V_XOR3_B32 diff --git a/llvm/test/CodeGen/AMDGPU/xor_add.ll b/llvm/test/CodeGen/AMDGPU/xor_add.ll --- a/llvm/test/CodeGen/AMDGPU/xor_add.ll +++ b/llvm/test/CodeGen/AMDGPU/xor_add.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s ; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s ; =================================================================================== ; V_XAD_U32