diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -436,6 +436,37 @@
 let SubtargetPredicate = HasDot8Insts in {
 defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>;
 defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>;
+
+// Codegen patterns selecting the signed dot products (built from the MulI*
+// fragments) onto the *_IU* dot instructions. Source-modifier immediates:
+// 8 = OP_SEL_1 (the VOP3P default), 9 = OP_SEL_1 | NEG. On these instructions
+// the NEG bit of src0/src1 marks that operand as signed, so both multiplicands
+// use 9; the i32 accumulator $src2 keeps the default 8.
+
+// Signed 4-element (8-bit) dot product accumulated into $src2.
+let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT4_I32_IU8").SubtargetPredicate in
+def : GCNPat <
+    !cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,
+                      (add_oneuse lhs, (!cast<PatFrag>("MulI_Elt"#y) i32:$src0, i32:$src1)))),
+    (!cast<VOP3P_Pseudo>("V_DOT4_I32_IU8") (i32 9), $src0, (i32 9), $src1, (i32 8), $src2, (i1 0))>;
+
+// Signed 8-element (4-bit) dot product: element 0 is added to $src2, then elements 1..7.
+let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_I32_IU4").SubtargetPredicate in
+def : GCNPat <
+    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("MulI0_4bit") i32:$src0, i32:$src1)),
+                      [1, 2, 3, 4, 5, 6, 7], lhs, y,
+                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("MulI"#y#"_4bit") i32:$src0, i32:$src1)))),
+    (!cast<VOP3P_Pseudo>("V_DOT8_I32_IU4") (i32 9), $src0, (i32 9), $src1, (i32 8), $src2, (i1 0))>;
+
+// Same reduction, but with element 7 added directly after element 0.
+// NOTE(review): presumably needed because NonACAdd_oneuse fixes the operand
+// order, so this ordering needs its own pattern -- confirm.
+let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_I32_IU4").SubtargetPredicate in
+def : GCNPat <
+    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("MulI0_4bit") i32:$src0, i32:$src1)),
+                      [7, 1, 2, 3, 4, 5, 6], lhs, y,
+                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("MulI"#y#"_4bit") i32:$src0, i32:$src1)))),
+    (!cast<VOP3P_Pseudo>("V_DOT8_I32_IU4") (i32 9), $src0, (i32 9), $src1, (i32 8), $src2, (i1 0))>;
 } // End SubtargetPredicate = HasDot8Insts
 
 def : UDot2Pat<V_DOT2_U32_U16>;
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -5,6 +5,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
 
 define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc32:
@@ -127,6 +129,23 @@
 ; GFX10-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0]
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -358,6 +377,45 @@
 ; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc16:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:
global_load_u16 v3, v2, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; GFX11-DL-NEXT: v_bfe_i32 v7, v0, 0, 8 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v6, v6, 0, 8 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v7, v3 +; GFX11-DL-NEXT: v_bfe_i32 v4, v8, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v7, v9, 0, 8 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-DL-NEXT: v_mad_u16 v3, v5, v6, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v7, v3 +; GFX11-DL-NEXT: v_mad_u16 v0, v1, v0, v3 +; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -547,6 +605,35 @@ ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_acc8: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 
8, v2 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_mad_u16 v3, v2, v0, v3 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: v_mad_u16 v0, v2, v0, v3 +; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -728,6 +815,40 @@ ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_multiuse_mul1: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v4, v1, 8, 8 +; GFX11-DL-NEXT: v_bfe_i32 v5, v0, 8, 8 +; GFX11-DL-NEXT: v_bfe_i32 v6, v1, 16, 8 +; GFX11-DL-NEXT: v_bfe_i32 v7, v0, 16, 8 +; GFX11-DL-NEXT: v_ashrrev_i32_e32 v1, 24, v1 +; GFX11-DL-NEXT: v_ashrrev_i32_e32 v0, 24, v0 +; GFX11-DL-NEXT: v_mul_i32_i24_e32 v8, v2, v3 +; GFX11-DL-NEXT: v_mul_i32_i24_e32 v4, v4, v5 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, v3, s2 +; GFX11-DL-NEXT: v_mul_i32_i24_e32 v3, v6, v7 +; 
GFX11-DL-NEXT: v_mul_i32_i24_e32 v0, v1, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-DL-NEXT: v_add3_u32 v1, v4, v2, v8 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: v_add3_u32 v0, v1, v3, v0 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -917,6 +1038,41 @@ ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_acc32_vecMul: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b16 v2, 8, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_lshrrev_b16 v3, 8, v0 +; GFX11-DL-NEXT: v_bfe_i32 v5, v1, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v6, v0, 0, 8 +; GFX11-DL-NEXT: v_ashrrev_i32_e32 v4, 24, v1 +; GFX11-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v1, v1, 16, 8 +; GFX11-DL-NEXT: v_ashrrev_i32_e32 v7, 24, v0 +; GFX11-DL-NEXT: v_bfe_i32 v0, v0, 16, 8 +; GFX11-DL-NEXT: v_mul_i32_i24_e32 v5, v5, v6 +; GFX11-DL-NEXT: v_mul_i32_i24_e32 v2, v2, v3 +; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mul_i32_i24_e32 v0, v1, v0 +; GFX11-DL-NEXT: v_mul_i32_i24_e32 v1, v4, v7 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: v_add3_u32 v2, v5, s2, v2 +; GFX11-DL-NEXT: v_add3_u32 v0, v2, v0, v1 +; GFX11-DL-NEXT: 
global_store_b32 v3, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1139,6 +1295,52 @@ ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot4_acc16_vecMul: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_ashrrev_i16 v4, 8, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_ashrrev_i16 v5, 8, v0 +; GFX11-DL-NEXT: v_bfe_i32 v6, v0, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v7, v1, 0, 8 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_perm_b32 v5, v5, v6, 0x5040100 +; GFX11-DL-NEXT: v_perm_b32 v4, v4, v7, 0x5040100 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_ashrrev_i16 v6, 8, v1 +; GFX11-DL-NEXT: v_ashrrev_i16 v7, 8, v0 +; GFX11-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_perm_b32 v0, v7, v0, 0x5040100 +; GFX11-DL-NEXT: v_perm_b32 v1, v6, v1, 0x5040100 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-DL-NEXT: v_add_nc_u16 v3, v4, v3 +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DL-NEXT: v_add_nc_u16 v1, v3, v5 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0 +; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3 +; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -5,6 +5,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: udot4_acc32: @@ -127,6 +129,23 @@ ; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot4_acc32: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, 
v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -337,6 +356,43 @@ ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot4_acc16: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_and_b32_e32 v7, 0xff, v0 +; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX11-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v7, v3 +; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-DL-NEXT: v_and_b32_e32 v7, 0xff, v9 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v3, v5, v6, v3 +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v7, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v0, v1, v0, v3 +; GFX11-DL-NEXT: global_store_b16 v2, 
v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -527,6 +583,35 @@ ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot4_acc8: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_mad_u16 v3, v2, v0, v3 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: v_mad_u16 v0, v2, v0, v3 +; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -680,6 +765,29 @@ ; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v2 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot2_8: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 
v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_mad_u16 v0, v2, v0, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -851,6 +959,35 @@ ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot4_CommutationInsideMAD: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_mad_u16 v3, v0, v2, v3 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-DL-NEXT: v_mad_u16 v3, v5, v4, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-DL-NEXT: v_mad_u16 v3, v7, v6, v3 +; GFX11-DL-NEXT: v_mad_u16 v0, v0, v2, v3 +; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1034,6 +1171,36 @@ ; GFX10-DL-NEXT: v_mad_u16 v0, v3, v2, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot4_CommutationAccrossMADs: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_mad_u16 v3, v5, v4, v3 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-DL-NEXT: v_mad_u16 v3, v0, v2, v3 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v3, v5, v4, v3 +; GFX11-DL-NEXT: v_mad_u16 v0, v0, v2, v3 +; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1216,6 +1383,40 @@ ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: 
s_endpgm +; +; GFX11-DL-LABEL: udot4_multiuse_mul1: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX11-DL-NEXT: v_bfe_u32 v4, v1, 8, 8 +; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX11-DL-NEXT: v_bfe_u32 v6, v1, 16, 8 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 16, 8 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v8, v2, v3 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v5 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, v3, s2 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v3, v6, v7 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v0, v1, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-DL-NEXT: v_add3_u32 v1, v4, v2, v8 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: v_add3_u32 v0, v1, v3, v0 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1409,6 +1610,41 @@ ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot4_multiuse_add1: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; 
GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX11-DL-NEXT: v_bfe_u32 v4, v1, 8, 8 +; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX11-DL-NEXT: v_bfe_u32 v6, v1, 16, 8 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 16, 8 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: v_mad_u32_u24 v3, v4, v5, s2 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v4, v6, v7 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v0, v1, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_add_nc_u32_e32 v1, s2, v3 +; GFX11-DL-NEXT: v_add3_u32 v2, v3, v2, v4 +; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_add3_u32 v0, v2, v0, v1 +; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1622,6 +1858,43 @@ ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: notdot4_mixedtypes: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: 
v_lshrrev_b32_e32 v5, 8, v0 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-DL-NEXT: v_bfe_i32 v8, v1, 0, 8 +; GFX11-DL-NEXT: v_bfe_i32 v9, v0, 0, 8 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GFX11-DL-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v3, v8, v9, v3 +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v0, v1, v0, v3 +; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1803,6 +2076,41 @@ ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v3, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot4_acc32_vecMul: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b16 v2, 8, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_lshrrev_b16 v3, 8, v0 +; GFX11-DL-NEXT: v_and_b32_e32 v5, 0xff, v1 +; GFX11-DL-NEXT: v_and_b32_e32 v6, 0xff, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX11-DL-NEXT: 
v_and_b32_e32 v2, 0xffff, v2 +; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX11-DL-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX11-DL-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v2, v2, v3 +; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v0, v1, v0 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v1, v4, v7 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: v_add3_u32 v2, v5, s2, v2 +; GFX11-DL-NEXT: v_add3_u32 v0, v2, v0, v1 +; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -2020,6 +2328,50 @@ ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot4_acc16_vecMul: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v7, 0xff, v1 +; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-DL-NEXT: v_lshrrev_b16 v4, 8, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b16 v5, 8, v0 +; GFX11-DL-NEXT: v_and_b32_e32 v6, 0xff, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GFX11-DL-NEXT: v_perm_b32 v4, v4, v7, 0x5040100 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX11-DL-NEXT: v_perm_b32 v5, v5, v6, 0x5040100 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; 
GFX11-DL-NEXT: v_and_b32_e32 v6, 0xff, v9 +; GFX11-DL-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX11-DL-NEXT: v_perm_b32 v0, v0, v6, 0x5040100 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_add_nc_u16 v3, v4, v3 +; GFX11-DL-NEXT: v_add_nc_u16 v1, v3, v5 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3 +; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -2212,6 +2564,52 @@ ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot4_acc8_vecMul: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u8 v3, v2, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: 
v_lshrrev_b32_e32 v6, 24, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-DL-NEXT: v_lshrrev_b16 v8, 8, v1 +; GFX11-DL-NEXT: v_lshrrev_b16 v9, 8, v0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_mad_u16 v0, v1, v0, v3 +; GFX11-DL-NEXT: v_mul_lo_u16 v5, v5, v6 +; GFX11-DL-NEXT: v_mul_lo_u16 v6, v4, v7 +; GFX11-DL-NEXT: v_mul_lo_u16 v8, v8, v9 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_lshlrev_b16 v5, 8, v5 +; GFX11-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_lshlrev_b16 v8, 8, v8 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v5 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_or_b32_e32 v6, v6, v5 +; GFX11-DL-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-DL-NEXT: v_or_b32_e32 v6, v8, v6 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v6 +; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v6 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v0, v4, v7, v0 +; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX11-DL-NEXT: global_store_b8 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -7,6 +7,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GFX10-DL-NOXNACK %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: idot8_acc32: @@ -228,6 +230,23 @@ ; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2 ; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot8_acc32: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-DL-NEXT: v_dot8_i32_iu4 v0, v1, v0, s2 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -750,6 +769,81 @@ ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot8_acc16: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] 
+; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX11-DL-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v16, 4, v0 +; GFX11-DL-NEXT: v_lshlrev_b16 v17, 12, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v13, 20, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX11-DL-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX11-DL-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX11-DL-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX11-DL-NEXT: v_ashrrev_i16 v17, 12, v17 +; GFX11-DL-NEXT: v_lshlrev_b16 v9, 12, v9 +; GFX11-DL-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX11-DL-NEXT: v_ashrrev_i16 v10, 12, v10 +; GFX11-DL-NEXT: v_ashrrev_i16 v16, 12, v16 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_mad_u16 v1, v1, v17, v3 +; GFX11-DL-NEXT: v_ashrrev_i16 v3, 12, v9 +; GFX11-DL-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX11-DL-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX11-DL-NEXT: v_lshlrev_b16 v9, 12, v15 +; GFX11-DL-NEXT: v_mad_u16 v1, v10, v16, v1 +; GFX11-DL-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX11-DL-NEXT: v_lshlrev_b16 v10, 12, v14 +; GFX11-DL-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX11-DL-NEXT: v_ashrrev_i16 v9, 12, v9 +; GFX11-DL-NEXT: v_mad_u16 v0, v3, v0, v1 +; GFX11-DL-NEXT: v_ashrrev_i16 v1, 12, v7 +; GFX11-DL-NEXT: v_ashrrev_i16 v3, 12, v10 +; GFX11-DL-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX11-DL-NEXT: v_lshlrev_b16 v7, 12, v13 +; GFX11-DL-NEXT: v_mad_u16 v0, v8, v9, v0 +; GFX11-DL-NEXT: v_lshlrev_b16 v5, 12, 
v5 +; GFX11-DL-NEXT: v_lshlrev_b16 v8, 12, v12 +; GFX11-DL-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX11-DL-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX11-DL-NEXT: v_mad_u16 v0, v1, v3, v0 +; GFX11-DL-NEXT: v_ashrrev_i16 v1, 12, v5 +; GFX11-DL-NEXT: v_ashrrev_i16 v3, 12, v8 +; GFX11-DL-NEXT: v_lshlrev_b16 v4, 12, v4 +; GFX11-DL-NEXT: v_lshlrev_b16 v5, 12, v11 +; GFX11-DL-NEXT: v_mad_u16 v0, v6, v7, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_ashrrev_i16 v4, 12, v4 +; GFX11-DL-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v0, v1, v3, v0 +; GFX11-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1271,6 +1365,81 @@ ; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0 ; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot8_acc8: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u8 v3, v2, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX11-DL-NEXT: v_lshlrev_b16 
v1, 12, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v16, 4, v0 +; GFX11-DL-NEXT: v_lshlrev_b16 v17, 12, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v13, 20, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX11-DL-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX11-DL-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX11-DL-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX11-DL-NEXT: v_ashrrev_i16 v17, 12, v17 +; GFX11-DL-NEXT: v_lshlrev_b16 v9, 12, v9 +; GFX11-DL-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX11-DL-NEXT: v_ashrrev_i16 v10, 12, v10 +; GFX11-DL-NEXT: v_ashrrev_i16 v16, 12, v16 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_mad_u16 v1, v1, v17, v3 +; GFX11-DL-NEXT: v_ashrrev_i16 v3, 12, v9 +; GFX11-DL-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX11-DL-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX11-DL-NEXT: v_lshlrev_b16 v9, 12, v15 +; GFX11-DL-NEXT: v_mad_u16 v1, v10, v16, v1 +; GFX11-DL-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX11-DL-NEXT: v_lshlrev_b16 v10, 12, v14 +; GFX11-DL-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX11-DL-NEXT: v_ashrrev_i16 v9, 12, v9 +; GFX11-DL-NEXT: v_mad_u16 v0, v3, v0, v1 +; GFX11-DL-NEXT: v_ashrrev_i16 v1, 12, v7 +; GFX11-DL-NEXT: v_ashrrev_i16 v3, 12, v10 +; GFX11-DL-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX11-DL-NEXT: v_lshlrev_b16 v7, 12, v13 +; GFX11-DL-NEXT: v_mad_u16 v0, v8, v9, v0 +; GFX11-DL-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX11-DL-NEXT: v_lshlrev_b16 v8, 12, v12 +; GFX11-DL-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX11-DL-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX11-DL-NEXT: v_mad_u16 v0, v1, v3, v0 +; GFX11-DL-NEXT: v_ashrrev_i16 v1, 12, v5 +; GFX11-DL-NEXT: v_ashrrev_i16 v3, 12, v8 +; GFX11-DL-NEXT: v_lshlrev_b16 v4, 12, v4 +; GFX11-DL-NEXT: v_lshlrev_b16 v5, 12, v11 +; GFX11-DL-NEXT: v_mad_u16 v0, v6, v7, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_ashrrev_i16 v4, 12, v4 +; GFX11-DL-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v0, v1, v3, v0 +; GFX11-DL-NEXT: v_mad_u16 v0, v4, v5, v0 +; GFX11-DL-NEXT: global_store_b8 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1659,6 +1828,55 @@ ; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v5 ; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot8_multiuses_mul1: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 4 +; GFX11-DL-NEXT: v_bfe_i32 v3, v1, 4, 4 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_bfe_i32 v4, v0, 4, 4 +; GFX11-DL-NEXT: v_bfe_i32 v5, v1, 8, 4 +; GFX11-DL-NEXT: v_bfe_i32 v6, v0, 8, 4 +; GFX11-DL-NEXT: v_bfe_i32 v7, v0, 0, 4 +; GFX11-DL-NEXT: v_bfe_i32 v8, v1, 12, 4 +; GFX11-DL-NEXT: v_mul_i32_i24_e32 v3, v3, v4 +; GFX11-DL-NEXT: v_bfe_i32 v9, v0, 12, 4 +; GFX11-DL-NEXT: v_mul_i32_i24_e32 v4, v5, v6 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: v_mad_i32_i24 v5, v2, v7, s2 +; GFX11-DL-NEXT: v_bfe_i32 v6, v1, 16, 4 +; GFX11-DL-NEXT: v_bfe_i32 v10, v0, 16, 4 +; GFX11-DL-NEXT: v_bfe_i32 v11, v1, 20, 4 +; GFX11-DL-NEXT: v_bfe_i32 v12, v0, 20, 4 +; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, v7, v5 +; GFX11-DL-NEXT: v_bfe_i32 v7, v1, 24, 4 +; GFX11-DL-NEXT: v_bfe_i32 v13, v0, 24, 4 +; 
GFX11-DL-NEXT: v_mul_i32_i24_e32 v8, v8, v9 +; GFX11-DL-NEXT: v_mul_i32_i24_e32 v6, v6, v10 +; GFX11-DL-NEXT: v_add3_u32 v2, v2, v3, v4 +; GFX11-DL-NEXT: v_mul_i32_i24_e32 v3, v11, v12 +; GFX11-DL-NEXT: v_mul_i32_i24_e32 v4, v7, v13 +; GFX11-DL-NEXT: v_ashrrev_i32_e32 v1, 28, v1 +; GFX11-DL-NEXT: v_ashrrev_i32_e32 v0, 28, v0 +; GFX11-DL-NEXT: v_add3_u32 v2, v2, v8, v6 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DL-NEXT: v_mul_i32_i24_e32 v0, v1, v0 +; GFX11-DL-NEXT: v_add3_u32 v1, v2, v3, v4 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_add3_u32 v0, v1, v0, v5 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1953,6 +2171,23 @@ ; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2 ; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot8_acc32_vecMul: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-DL-NEXT: v_dot8_i32_iu4 v0, v1, v0, s2 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -2496,6 +2731,99 @@ ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3 ; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm +; +; 
GFX11-DL-LABEL: idot8_acc16_vecMul: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v12, 4, v0 +; GFX11-DL-NEXT: v_lshlrev_b16 v4, 12, v1 +; GFX11-DL-NEXT: v_lshlrev_b16 v11, 12, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX11-DL-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX11-DL-NEXT: v_lshlrev_b16 v12, 12, v12 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v13, 8, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v14, 12, v0 +; GFX11-DL-NEXT: v_ashrrev_i16 v4, 12, v4 +; GFX11-DL-NEXT: v_ashrrev_i16 v11, 12, v11 +; GFX11-DL-NEXT: v_ashrrev_i16 v12, 12, v12 +; GFX11-DL-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX11-DL-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX11-DL-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX11-DL-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX11-DL-NEXT: v_lshlrev_b16 v13, 12, v13 +; GFX11-DL-NEXT: v_lshlrev_b16 v14, 12, v14 +; GFX11-DL-NEXT: v_perm_b32 v11, v12, v11, 0x5040100 +; GFX11-DL-NEXT: v_perm_b32 v4, v5, v4, 0x5040100 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v9, 20, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v16, 20, v0 +; GFX11-DL-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX11-DL-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX11-DL-NEXT: v_ashrrev_i16 v5, 12, v8 +; GFX11-DL-NEXT: v_ashrrev_i16 v8, 12, v13 +; GFX11-DL-NEXT: v_ashrrev_i16 v12, 12, v14 +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v11 +; GFX11-DL-NEXT: v_lshlrev_b16 v9, 12, v9 +; 
GFX11-DL-NEXT: v_lshlrev_b16 v15, 12, v15 +; GFX11-DL-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX11-DL-NEXT: v_perm_b32 v8, v12, v8, 0x5040100 +; GFX11-DL-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_add_nc_u16 v3, v4, v3 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v17, 24, v0 +; GFX11-DL-NEXT: v_ashrrev_i16 v9, 12, v9 +; GFX11-DL-NEXT: v_ashrrev_i16 v11, 12, v15 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v0 +; GFX11-DL-NEXT: v_ashrrev_i16 v4, 12, v16 +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v8 +; GFX11-DL-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX11-DL-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX11-DL-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX11-DL-NEXT: v_lshlrev_b16 v12, 12, v17 +; GFX11-DL-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX11-DL-NEXT: v_perm_b32 v4, v4, v11, 0x5040100 +; GFX11-DL-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX11-DL-NEXT: v_add_nc_u16 v3, v3, v6 +; GFX11-DL-NEXT: v_ashrrev_i16 v10, 12, v10 +; GFX11-DL-NEXT: v_ashrrev_i16 v6, 12, v12 +; GFX11-DL-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX11-DL-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX11-DL-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_perm_b32 v0, v0, v6, 0x5040100 +; GFX11-DL-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX11-DL-NEXT: v_add_nc_u16 v3, v3, v4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX11-DL-NEXT: v_add_nc_u16 v1, v3, v5 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3 +; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -3087,6 +3415,115 @@ ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-NOXNACK-NEXT: global_store_byte v4, v0, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_endpgm +; +; GFX11-DL-LABEL: idot8_acc8_vecMul: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v12, 16, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v16, 8, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v3, 20, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v10, 4, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v13, 28, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v14, 24, v0 +; GFX11-DL-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v17, 4, v0 +; GFX11-DL-NEXT: v_lshlrev_b16 v0, 12, v0 +; GFX11-DL-NEXT: v_lshlrev_b16 v9, 12, v9 +; GFX11-DL-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX11-DL-NEXT: v_lshlrev_b16 v5, 12, v5 +; GFX11-DL-NEXT: v_lshlrev_b16 v16, 12, v16 +; GFX11-DL-NEXT: v_lshlrev_b16 v15, 12, v15 +; GFX11-DL-NEXT: 
v_lshlrev_b16 v12, 12, v12 +; GFX11-DL-NEXT: v_ashrrev_i16 v1, 12, v1 +; GFX11-DL-NEXT: v_ashrrev_i16 v0, 12, v0 +; GFX11-DL-NEXT: v_ashrrev_i16 v9, 12, v9 +; GFX11-DL-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX11-DL-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX11-DL-NEXT: v_ashrrev_i16 v16, 12, v16 +; GFX11-DL-NEXT: v_ashrrev_i16 v15, 12, v15 +; GFX11-DL-NEXT: v_ashrrev_i16 v12, 12, v12 +; GFX11-DL-NEXT: v_mul_lo_u16 v1, v1, v0 +; GFX11-DL-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX11-DL-NEXT: v_lshlrev_b16 v17, 12, v17 +; GFX11-DL-NEXT: v_mul_lo_u16 v0, v8, v15 +; GFX11-DL-NEXT: v_mul_lo_u16 v8, v9, v16 +; GFX11-DL-NEXT: v_mul_lo_u16 v9, v5, v12 +; GFX11-DL-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-DL-NEXT: v_lshlrev_b16 v7, 12, v7 +; GFX11-DL-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX11-DL-NEXT: v_lshlrev_b16 v3, 12, v3 +; GFX11-DL-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-DL-NEXT: global_load_u8 v2, v4, s[0:1] +; GFX11-DL-NEXT: v_lshlrev_b16 v14, 12, v14 +; GFX11-DL-NEXT: v_lshlrev_b16 v13, 12, v13 +; GFX11-DL-NEXT: v_lshlrev_b16 v11, 12, v11 +; GFX11-DL-NEXT: v_ashrrev_i16 v10, 12, v10 +; GFX11-DL-NEXT: v_ashrrev_i16 v17, 12, v17 +; GFX11-DL-NEXT: v_ashrrev_i16 v7, 12, v7 +; GFX11-DL-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX11-DL-NEXT: v_ashrrev_i16 v3, 12, v3 +; GFX11-DL-NEXT: v_ashrrev_i16 v14, 12, v14 +; GFX11-DL-NEXT: v_ashrrev_i16 v13, 12, v13 +; GFX11-DL-NEXT: v_ashrrev_i16 v11, 12, v11 +; GFX11-DL-NEXT: v_mul_lo_u16 v10, v10, v17 +; GFX11-DL-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX11-DL-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-DL-NEXT: v_mul_lo_u16 v6, v6, v13 +; GFX11-DL-NEXT: v_mul_lo_u16 v3, v3, v11 +; GFX11-DL-NEXT: v_mul_lo_u16 v11, v7, v14 +; GFX11-DL-NEXT: v_lshlrev_b16 v10, 8, v10 +; GFX11-DL-NEXT: v_or_b32_e32 v8, v8, v0 +; GFX11-DL-NEXT: v_lshlrev_b16 v6, 8, v6 +; GFX11-DL-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX11-DL-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-DL-NEXT: v_and_b32_e32 v13, 0xffff, v10 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX11-DL-NEXT: v_and_b32_e32 v1, 
0xff, v1 +; GFX11-DL-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX11-DL-NEXT: v_or_b32_e32 v9, v11, v6 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_or_b32_e32 v11, v13, v0 +; GFX11-DL-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v11 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_add_nc_u16 v2, v1, v2 +; GFX11-DL-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DL-NEXT: v_add_nc_u16 v9, v2, v10 +; GFX11-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_add_nc_u16 v0, v9, v8 +; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v2 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v0, v5, v12, v0 +; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v6 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v0, v7, v14, v0 +; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX11-DL-NEXT: global_store_b8 v4, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -5,6 +5,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s ; 
RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX7-LABEL: udot8_acc32: @@ -204,6 +206,23 @@ ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot8_acc32: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-DL-NEXT: v_dot8_u32_u4 v0, v1, v0, s2 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -522,6 +541,52 @@ ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_short v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot8_acc16: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] 
+; GFX11-DL-NEXT: global_load_u16 v3, v1, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_and_b32_e32 v5, 15, v0 +; GFX11-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 4, 4 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 +; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 8, 4 +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 12, 4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 +; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 16, 4 +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 20, 4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 +; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 24, 4 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v0 +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_mad_u16 v0, v2, v0, v3 +; GFX11-DL-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -840,6 +905,52 @@ ; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot8_acc8: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; 
GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_and_b32_e32 v5, 15, v0 +; GFX11-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 4, 4 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 +; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 8, 4 +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 12, 4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 +; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 16, 4 +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 20, 4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 +; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 24, 4 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v0 +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_mad_u16 v0, v2, v0, v3 +; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1163,6 
+1274,54 @@ ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot8_acc4: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_and_b32_e32 v5, 15, v0 +; GFX11-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 4, 4 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 +; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 8, 4 +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 12, 4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 +; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 16, 4 +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 20, 4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v0 +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_mad_u16 v0, v2, v0, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1470,6 +1629,54 @@ ; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX10-DL-NEXT: global_store_byte v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot8_CommutationInsideMAD: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1] +; GFX11-DL-NEXT: s_waitcnt vmcnt(2) +; GFX11-DL-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_and_b32_e32 v5, 15, v0 +; GFX11-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 4, 4 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 +; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 8, 4 +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 12, 4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 +; GFX11-DL-NEXT: v_bfe_u32 v5, v0, 16, 4 +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 20, 4 +; GFX11-DL-NEXT: 
s_delay_alu instid0(VALU_DEP_3) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v0 +; GFX11-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3 +; GFX11-DL-NEXT: v_mad_u16 v0, v2, v0, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1788,6 +1995,55 @@ ; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot8_multiuses_mul1: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_and_b32_e32 v8, 15, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_and_b32_e32 v9, 15, v0 +; GFX11-DL-NEXT: v_bfe_u32 v2, v1, 4, 4 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v1 +; GFX11-DL-NEXT: v_bfe_u32 v4, v1, 24, 4 +; GFX11-DL-NEXT: v_bfe_u32 v5, v1, 20, 4 +; GFX11-DL-NEXT: v_bfe_u32 v6, v1, 16, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX11-DL-NEXT: v_bfe_u32 v1, v1, 8, 4 +; GFX11-DL-NEXT: v_bfe_u32 v10, v0, 4, 4 +; GFX11-DL-NEXT: v_bfe_u32 v11, v0, 8, 4 +; GFX11-DL-NEXT: v_bfe_u32 v12, v0, 12, 4 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: 
v_mad_u32_u24 v13, v8, v9, s2 +; GFX11-DL-NEXT: v_bfe_u32 v14, v0, 20, 4 +; GFX11-DL-NEXT: v_bfe_u32 v15, v0, 16, 4 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v11 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v12 +; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, v10, v13 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v0 +; GFX11-DL-NEXT: v_bfe_u32 v0, v0, 24, 4 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v15 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v14 +; GFX11-DL-NEXT: v_add3_u32 v1, v2, v1, v7 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v10 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v0, v4, v0 +; GFX11-DL-NEXT: v_mul_u32_u24_e32 v3, v8, v9 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_add3_u32 v1, v1, v6, v5 +; GFX11-DL-NEXT: v_add3_u32 v0, v1, v0, v2 +; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-DL-NEXT: v_add3_u32 v0, v3, v13, v0 +; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -2059,6 +2315,23 @@ ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot8_acc32_vecMul: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-DL-NEXT: v_dot8_u32_u4 v0, v1, v0, s2 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr 
addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -2383,6 +2656,73 @@ ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot8_acc16_vecMul: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_and_b32_e32 v5, 15, v0 +; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1] +; GFX11-DL-NEXT: v_bfe_u32 v6, v0, 4, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 +; GFX11-DL-NEXT: v_bfe_u32 v8, v0, 12, 4 +; GFX11-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 +; GFX11-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 +; GFX11-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX11-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 +; GFX11-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 8, 4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX11-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 +; GFX11-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 +; GFX11-DL-NEXT: v_bfe_u32 v9, v0, 20, 4 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100 +; GFX11-DL-NEXT: v_bfe_u32 v7, v1, 24, 4 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX11-DL-NEXT: v_perm_b32 v1, 
v1, v7, 0x5040100 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_add_nc_u16 v3, v4, v3 +; GFX11-DL-NEXT: v_bfe_u32 v4, v0, 16, 4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DL-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX11-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_add_nc_u16 v3, v3, v6 +; GFX11-DL-NEXT: v_bfe_u32 v6, v0, 24, 4 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v0 +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX11-DL-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_perm_b32 v0, v0, v6, 0x5040100 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_add_nc_u16 v3, v3, v4 +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DL-NEXT: v_add_nc_u16 v1, v3, v5 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0 +; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3 +; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -2749,6 +3089,80 @@ ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot8_acc8_vecMul: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v5, 15, v1 +; GFX11-DL-NEXT: global_load_u8 v2, v4, s[0:1] +; GFX11-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_bfe_u32 v12, v0, 12, 4 +; GFX11-DL-NEXT: v_bfe_u32 v3, v1, 4, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 +; GFX11-DL-NEXT: v_bfe_u32 v9, v1, 24, 4 +; GFX11-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 +; GFX11-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX11-DL-NEXT: v_bfe_u32 v1, v0, 4, 4 +; GFX11-DL-NEXT: v_bfe_u32 v14, v0, 8, 4 +; GFX11-DL-NEXT: v_mul_lo_u16 v6, v6, v12 +; GFX11-DL-NEXT: v_and_b32_e32 v13, 15, v0 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v0 +; GFX11-DL-NEXT: v_bfe_u32 v16, v0, 24, 4 +; GFX11-DL-NEXT: v_bfe_u32 v12, v0, 16, 4 +; GFX11-DL-NEXT: v_bfe_u32 v0, v0, 20, 4 +; GFX11-DL-NEXT: v_mul_lo_u16 v7, v7, v14 +; GFX11-DL-NEXT: v_mul_lo_u16 v1, v3, v1 +; GFX11-DL-NEXT: v_lshlrev_b16 v3, 8, v6 +; GFX11-DL-NEXT: v_mul_lo_u16 v8, v8, v15 +; GFX11-DL-NEXT: v_mul_lo_u16 v0, v10, v0 +; GFX11-DL-NEXT: v_mul_lo_u16 v6, v11, v12 +; GFX11-DL-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-DL-NEXT: v_or_b32_e32 v7, v7, v3 +; GFX11-DL-NEXT: v_mul_lo_u16 v3, v9, v16 +; GFX11-DL-NEXT: v_lshlrev_b16 v10, 8, v0 +; GFX11-DL-NEXT: v_lshlrev_b16 v8, 8, v8 +; GFX11-DL-NEXT: v_and_b32_e32 v14, 0xffff, v1 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX11-DL-NEXT: v_mul_lo_u16 v5, v5, v13 +; GFX11-DL-NEXT: v_or_b32_e32 v6, v6, v10 +; GFX11-DL-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_or_b32_e32 v10, v14, v0 +; GFX11-DL-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_and_b32_e32 v5, 
0xffff, v6 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v10 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_add_nc_u16 v2, v1, v2 +; GFX11-DL-NEXT: v_or_b32_e32 v1, v5, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DL-NEXT: v_add_nc_u16 v5, v2, v6 +; GFX11-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_add_nc_u16 v0, v5, v7 +; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v2 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v0, v11, v12, v0 +; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_mad_u16 v0, v9, v16, v0 +; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX11-DL-NEXT: global_store_b8 v4, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -3077,6 +3491,75 @@ ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot8_acc4_vecMul: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-DL-NEXT: v_and_b32_e32 v5, 15, v0 +; GFX11-DL-NEXT: global_load_u8 v3, v2, s[0:1] +; GFX11-DL-NEXT: v_bfe_u32 v6, v0, 4, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 +; GFX11-DL-NEXT: v_bfe_u32 v8, v0, 12, 4 +; GFX11-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 +; GFX11-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 +; GFX11-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX11-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 +; GFX11-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 +; GFX11-DL-NEXT: v_bfe_u32 v7, v0, 8, 4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX11-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 +; GFX11-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 +; GFX11-DL-NEXT: v_bfe_u32 v9, v0, 20, 4 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100 +; GFX11-DL-NEXT: v_bfe_u32 v7, v1, 24, 4 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX11-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) +; GFX11-DL-NEXT: v_add_nc_u16 v3, v4, v3 +; GFX11-DL-NEXT: v_bfe_u32 v4, v0, 16, 4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DL-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX11-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-DL-NEXT: v_add_nc_u16 v3, v3, v6 +; GFX11-DL-NEXT: v_bfe_u32 v6, v0, 24, 4 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v0 +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX11-DL-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_perm_b32 v0, v0, v6, 0x5040100 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-NEXT: v_add_nc_u16 v3, v3, v4 +; GFX11-DL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DL-NEXT: v_add_nc_u16 v1, v3, v5 +; GFX11-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0 +; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3 +; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-DL-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX11-DL-NEXT: global_store_b8 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -3278,6 +3761,23 @@ ; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s2 ; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-DL-NEXT: s_endpgm +; +; GFX11-DL-LABEL: udot8_variant1: +; GFX11-DL: ; %bb.0: ; %entry +; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DL-NEXT: s_clause 0x1 +; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-DL-NEXT: v_dot8_u32_u4 v0, v0, v1, s2 +; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-DL-NEXT: s_endpgm ptr addrspace(1) %v2addr, ptr addrspace(1) %dst) { entry: