diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -436,6 +436,28 @@
 let SubtargetPredicate = HasDot8Insts  in {
 defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>;
 defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>;
+// Select the one-use add chain $src2 + MulI_Elt0 + MulI_Elt1 + MulI_Elt2 + MulI_Elt3 as V_DOT4_I32_IU8.
+let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT4_I32_IU8").SubtargetPredicate in
+def : GCNPat < // NOTE(review): all src modifiers are (i32 8) -- confirm IU8 reads the bytes as signed here.
+  !cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,
+                    (add_oneuse lhs, (!cast<PatFrag>("MulI_Elt"#y) i32:$src0, i32:$src1)))),
+  (!cast<VOP3P_Pseudo>("V_DOT4_I32_IU8") (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+
+// Select ($src2 + MulI0_4bit) followed by NonACAdd_oneuse of MulI1..MulI7_4bit, in that order, as V_DOT8_I32_IU4.
+let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_I32_IU4").SubtargetPredicate in
+def : GCNPat < // NOTE(review): all src modifiers are (i32 8) -- confirm IU4 reads the nibbles as signed here.
+  !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("MulI0_4bit") i32:$src0, i32:$src1)),
+                    [1, 2, 3, 4, 5, 6, 7], lhs, y,
+                    (NonACAdd_oneuse lhs, (!cast<PatFrag>("MulI"#y#"_4bit") i32:$src0, i32:$src1)))),
+  (!cast<VOP3P_Pseudo>("V_DOT8_I32_IU4") (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+
+// Same dot8 chain as the [1..7] pattern, but with the products added in the order 7, 1, 2, 3, 4, 5, 6.
+let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_I32_IU4").SubtargetPredicate in
+def : GCNPat < // NOTE(review): all src modifiers are (i32 8) -- confirm IU4 reads the nibbles as signed here.
+  !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("MulI0_4bit") i32:$src0, i32:$src1)),
+                    [7, 1, 2, 3, 4, 5, 6], lhs, y,
+                    (NonACAdd_oneuse lhs, (!cast<PatFrag>("MulI"#y#"_4bit") i32:$src0, i32:$src1)))),
+  (!cast<VOP3P_Pseudo>("V_DOT8_I32_IU4") (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
 } // End SubtargetPredicate = HasDot8Insts
 
 def : UDot2Pat<V_DOT2_U32_U16>;
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -5,6 +5,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
 
 define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc32:
@@ -127,6 +129,23 @@
 ; GFX10-DL-NEXT:    v_dot4_i32_i8 v1, v1, v2, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_i32_iu8 v0, v1, v0, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -358,6 +377,45 @@
 ; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc16:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_bfe_i32 v4, v1, 0, 8
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
+; GFX11-DL-NEXT:    v_bfe_i32 v7, v0, 0, 8
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-DL-NEXT:    v_bfe_i32 v5, v5, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v6, v6, 0, 8
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
+; GFX11-DL-NEXT:    v_bfe_i32 v4, v8, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v7, v9, 0, 8
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
+; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -547,6 +605,35 @@
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc8:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v2, v0, v3
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    v_mad_u16 v0, v2, v0, v3
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
 entry:
@@ -728,6 +815,40 @@
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_multiuse_mul1:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_bfe_i32 v2, v1, 0, 8
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_bfe_i32 v3, v0, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v4, v1, 8, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v5, v0, 8, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v6, v1, 16, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v7, v0, 16, 8
+; GFX11-DL-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
+; GFX11-DL-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v8, v2, v3
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v4, v4, v5
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_mad_i32_i24 v2, v2, v3, s2
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v3, v6, v7
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v0, v1, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_add3_u32 v1, v4, v2, v8
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    v_add3_u32 v0, v1, v3, v0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                ptr addrspace(1) %src2,
                                                ptr addrspace(1) nocapture %dst) {
 entry:
@@ -917,6 +1038,41 @@
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc32_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b16 v2, 8, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_lshrrev_b16 v3, 8, v0
+; GFX11-DL-NEXT:    v_bfe_i32 v5, v1, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v6, v0, 0, 8
+; GFX11-DL-NEXT:    v_ashrrev_i32_e32 v4, 24, v1
+; GFX11-DL-NEXT:    v_bfe_i32 v2, v2, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v3, v3, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v1, v1, 16, 8
+; GFX11-DL-NEXT:    v_ashrrev_i32_e32 v7, 24, v0
+; GFX11-DL-NEXT:    v_bfe_i32 v0, v0, 16, 8
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v5, v5, v6
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v2, v2, v3
+; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v0, v1, v0
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v1, v4, v7
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_add3_u32 v2, v5, s2, v2
+; GFX11-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
+; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1139,6 +1295,52 @@
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot4_acc16_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_ashrrev_i16 v4, 8, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_ashrrev_i16 v5, 8, v0
+; GFX11-DL-NEXT:    v_bfe_i32 v6, v0, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v7, v1, 0, 8
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_perm_b32 v5, v5, v6, 0x5040100
+; GFX11-DL-NEXT:    v_perm_b32 v4, v4, v7, 0x5040100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_ashrrev_i16 v6, 8, v1
+; GFX11-DL-NEXT:    v_ashrrev_i16 v7, 8, v0
+; GFX11-DL-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
+; GFX11-DL-NEXT:    v_perm_b32 v1, v6, v1, 0x5040100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v4, v3
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_add_nc_u16 v1, v3, v5
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v1, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -5,6 +5,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
 
 define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_acc32:
@@ -127,6 +129,23 @@
 ; GFX10-DL-NEXT:    v_dot4_u32_u8 v1, v1, v2, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc32:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot4_u32_u8 v0, v1, v0, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -337,6 +356,43 @@
 ; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc16:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v4, 0xff, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_and_b32_e32 v7, 0xff, v0
+; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX11-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
+; GFX11-DL-NEXT:    v_and_b32_e32 v4, 0xff, v8
+; GFX11-DL-NEXT:    v_and_b32_e32 v7, 0xff, v9
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -527,6 +583,35 @@
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc8:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v2, v0, v3
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    v_mad_u16 v0, v2, v0, v3
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
 entry:
@@ -680,6 +765,29 @@
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v2
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot2_8:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v2, v0, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                    ptr addrspace(1) %src2,
                                    ptr addrspace(1) nocapture %dst) {
 entry:
@@ -851,6 +959,35 @@
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_CommutationInsideMAD:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v0, v2, v3
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    v_mad_u16 v3, v5, v4, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v7, v6, v3
+; GFX11-DL-NEXT:    v_mad_u16 v0, v0, v2, v3
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                       ptr addrspace(1) %src2,
                                                       ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1034,6 +1171,36 @@
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v3, v2, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_CommutationAccrossMADs:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v5, v4, v3
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-DL-NEXT:    v_mad_u16 v3, v0, v2, v3
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v5, v4, v3
+; GFX11-DL-NEXT:    v_mad_u16 v0, v0, v2, v3
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                         ptr addrspace(1) %src2,
                                                         ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1216,6 +1383,40 @@
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_multiuse_mul1:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v1, 8, 8
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v1, 16, 8
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 16, 8
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v8, v2, v3
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v4, v4, v5
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u32_u24 v2, v2, v3, s2
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v3, v6, v7
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v0, v1, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_add3_u32 v1, v4, v2, v8
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    v_add3_u32 v0, v1, v3, v0
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                ptr addrspace(1) %src2,
                                                ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1409,6 +1610,41 @@
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v2
 ; GFX10-DL-NEXT:    global_store_dword v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_multiuse_add1:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v1, 8, 8
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v1, 16, 8
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 16, 8
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v2, v2, v3
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u32_u24 v3, v4, v5, s2
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v4, v6, v7
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v0, v1, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_add_nc_u32_e32 v1, s2, v3
+; GFX11-DL-NEXT:    v_add3_u32 v2, v3, v2, v4
+; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
+; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                ptr addrspace(1) %src2,
                                                ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1622,6 +1858,43 @@
 ; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: notdot4_mixedtypes:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-DL-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; GFX11-DL-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    v_bfe_i32 v8, v1, 0, 8
+; GFX11-DL-NEXT:    v_bfe_i32 v9, v0, 0, 8
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_and_b32_e32 v4, 0xff, v6
+; GFX11-DL-NEXT:    v_and_b32_e32 v5, 0xff, v7
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v8, v9, v3
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1803,6 +2076,41 @@
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v3, v1
 ; GFX10-DL-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc32_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b16 v2, 8, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_lshrrev_b16 v3, 8, v0
+; GFX11-DL-NEXT:    v_and_b32_e32 v5, 0xff, v1
+; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX11-DL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 24, v0
+; GFX11-DL-NEXT:    v_bfe_u32 v1, v1, 16, 8
+; GFX11-DL-NEXT:    v_bfe_u32 v0, v0, 16, 8
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v5, v5, v6
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v2, v2, v3
+; GFX11-DL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v0, v1, v0
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v1, v4, v7
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_add3_u32 v2, v5, s2, v2
+; GFX11-DL-NEXT:    v_add3_u32 v0, v2, v0, v1
+; GFX11-DL-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -2020,6 +2328,50 @@
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc16_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v7, 0xff, v1
+; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    v_lshrrev_b16 v4, 8, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b16 v5, 8, v0
+; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-DL-NEXT:    v_perm_b32 v4, v4, v7, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
+; GFX11-DL-NEXT:    v_perm_b32 v5, v5, v6, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
+; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v9
+; GFX11-DL-NEXT:    v_and_b32_e32 v7, 0xff, v8
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v6, 0x5040100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v4, v3
+; GFX11-DL-NEXT:    v_add_nc_u16 v1, v3, v5
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v1, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -2212,6 +2564,52 @@
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v2
 ; GFX10-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot4_acc8_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-DL-NEXT:    v_lshrrev_b16 v8, 8, v1
+; GFX11-DL-NEXT:    v_lshrrev_b16 v9, 8, v0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v0, v3
+; GFX11-DL-NEXT:    v_mul_lo_u16 v5, v5, v6
+; GFX11-DL-NEXT:    v_mul_lo_u16 v6, v4, v7
+; GFX11-DL-NEXT:    v_mul_lo_u16 v8, v8, v9
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX11-DL-NEXT:    v_and_b32_e32 v6, 0xff, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_lshlrev_b16 v8, 8, v8
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_or_b32_e32 v6, v6, v5
+; GFX11-DL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-DL-NEXT:    v_or_b32_e32 v6, v8, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v6
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v4, v7, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-DL-NEXT:    global_store_b8 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                              ptr addrspace(1) %src2,
                                              ptr addrspace(1) nocapture %dst) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -7,6 +7,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
 
 define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot8_acc32:
@@ -228,6 +230,23 @@
 ; GFX10-DL-NOXNACK-NEXT:    v_dot8_i32_i4 v0, v1, v0, s2
 ; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot8_acc32:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot8_i32_iu4 v0, v1, v0, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -750,6 +769,81 @@
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v4, v5, v0
 ; GFX10-DL-NOXNACK-NEXT:    global_store_short v2, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot8_acc16:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
+; GFX11-DL-NEXT:    v_lshlrev_b16 v1, 12, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v16, 4, v0
+; GFX11-DL-NEXT:    v_lshlrev_b16 v17, 12, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v13, 20, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-DL-NEXT:    v_ashrrev_i16 v1, 12, v1
+; GFX11-DL-NEXT:    v_lshlrev_b16 v10, 12, v10
+; GFX11-DL-NEXT:    v_lshlrev_b16 v16, 12, v16
+; GFX11-DL-NEXT:    v_ashrrev_i16 v17, 12, v17
+; GFX11-DL-NEXT:    v_lshlrev_b16 v9, 12, v9
+; GFX11-DL-NEXT:    v_lshlrev_b16 v0, 12, v0
+; GFX11-DL-NEXT:    v_ashrrev_i16 v10, 12, v10
+; GFX11-DL-NEXT:    v_ashrrev_i16 v16, 12, v16
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u16 v1, v1, v17, v3
+; GFX11-DL-NEXT:    v_ashrrev_i16 v3, 12, v9
+; GFX11-DL-NEXT:    v_ashrrev_i16 v0, 12, v0
+; GFX11-DL-NEXT:    v_lshlrev_b16 v8, 12, v8
+; GFX11-DL-NEXT:    v_lshlrev_b16 v9, 12, v15
+; GFX11-DL-NEXT:    v_mad_u16 v1, v10, v16, v1
+; GFX11-DL-NEXT:    v_lshlrev_b16 v7, 12, v7
+; GFX11-DL-NEXT:    v_lshlrev_b16 v10, 12, v14
+; GFX11-DL-NEXT:    v_ashrrev_i16 v8, 12, v8
+; GFX11-DL-NEXT:    v_ashrrev_i16 v9, 12, v9
+; GFX11-DL-NEXT:    v_mad_u16 v0, v3, v0, v1
+; GFX11-DL-NEXT:    v_ashrrev_i16 v1, 12, v7
+; GFX11-DL-NEXT:    v_ashrrev_i16 v3, 12, v10
+; GFX11-DL-NEXT:    v_lshlrev_b16 v6, 12, v6
+; GFX11-DL-NEXT:    v_lshlrev_b16 v7, 12, v13
+; GFX11-DL-NEXT:    v_mad_u16 v0, v8, v9, v0
+; GFX11-DL-NEXT:    v_lshlrev_b16 v5, 12, v5
+; GFX11-DL-NEXT:    v_lshlrev_b16 v8, 12, v12
+; GFX11-DL-NEXT:    v_ashrrev_i16 v6, 12, v6
+; GFX11-DL-NEXT:    v_ashrrev_i16 v7, 12, v7
+; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v3, v0
+; GFX11-DL-NEXT:    v_ashrrev_i16 v1, 12, v5
+; GFX11-DL-NEXT:    v_ashrrev_i16 v3, 12, v8
+; GFX11-DL-NEXT:    v_lshlrev_b16 v4, 12, v4
+; GFX11-DL-NEXT:    v_lshlrev_b16 v5, 12, v11
+; GFX11-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_ashrrev_i16 v4, 12, v4
+; GFX11-DL-NEXT:    v_ashrrev_i16 v5, 12, v5
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v3, v0
+; GFX11-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
+; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1271,6 +1365,81 @@
 ; GFX10-DL-NOXNACK-NEXT:    v_mad_u16 v0, v4, v5, v0
 ; GFX10-DL-NOXNACK-NEXT:    global_store_byte v2, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot8_acc8:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
+; GFX11-DL-NEXT:    v_lshlrev_b16 v1, 12, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v16, 4, v0
+; GFX11-DL-NEXT:    v_lshlrev_b16 v17, 12, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v12, 24, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v13, 20, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
+; GFX11-DL-NEXT:    v_ashrrev_i16 v1, 12, v1
+; GFX11-DL-NEXT:    v_lshlrev_b16 v10, 12, v10
+; GFX11-DL-NEXT:    v_lshlrev_b16 v16, 12, v16
+; GFX11-DL-NEXT:    v_ashrrev_i16 v17, 12, v17
+; GFX11-DL-NEXT:    v_lshlrev_b16 v9, 12, v9
+; GFX11-DL-NEXT:    v_lshlrev_b16 v0, 12, v0
+; GFX11-DL-NEXT:    v_ashrrev_i16 v10, 12, v10
+; GFX11-DL-NEXT:    v_ashrrev_i16 v16, 12, v16
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u16 v1, v1, v17, v3
+; GFX11-DL-NEXT:    v_ashrrev_i16 v3, 12, v9
+; GFX11-DL-NEXT:    v_ashrrev_i16 v0, 12, v0
+; GFX11-DL-NEXT:    v_lshlrev_b16 v8, 12, v8
+; GFX11-DL-NEXT:    v_lshlrev_b16 v9, 12, v15
+; GFX11-DL-NEXT:    v_mad_u16 v1, v10, v16, v1
+; GFX11-DL-NEXT:    v_lshlrev_b16 v7, 12, v7
+; GFX11-DL-NEXT:    v_lshlrev_b16 v10, 12, v14
+; GFX11-DL-NEXT:    v_ashrrev_i16 v8, 12, v8
+; GFX11-DL-NEXT:    v_ashrrev_i16 v9, 12, v9
+; GFX11-DL-NEXT:    v_mad_u16 v0, v3, v0, v1
+; GFX11-DL-NEXT:    v_ashrrev_i16 v1, 12, v7
+; GFX11-DL-NEXT:    v_ashrrev_i16 v3, 12, v10
+; GFX11-DL-NEXT:    v_lshlrev_b16 v6, 12, v6
+; GFX11-DL-NEXT:    v_lshlrev_b16 v7, 12, v13
+; GFX11-DL-NEXT:    v_mad_u16 v0, v8, v9, v0
+; GFX11-DL-NEXT:    v_lshlrev_b16 v5, 12, v5
+; GFX11-DL-NEXT:    v_lshlrev_b16 v8, 12, v12
+; GFX11-DL-NEXT:    v_ashrrev_i16 v6, 12, v6
+; GFX11-DL-NEXT:    v_ashrrev_i16 v7, 12, v7
+; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v3, v0
+; GFX11-DL-NEXT:    v_ashrrev_i16 v1, 12, v5
+; GFX11-DL-NEXT:    v_ashrrev_i16 v3, 12, v8
+; GFX11-DL-NEXT:    v_lshlrev_b16 v4, 12, v4
+; GFX11-DL-NEXT:    v_lshlrev_b16 v5, 12, v11
+; GFX11-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_ashrrev_i16 v4, 12, v4
+; GFX11-DL-NEXT:    v_ashrrev_i16 v5, 12, v5
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v1, v3, v0
+; GFX11-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
+; GFX11-DL-NEXT:    global_store_b8 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1659,6 +1828,55 @@
 ; GFX10-DL-NOXNACK-NEXT:    v_add3_u32 v0, v1, v0, v5
 ; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot8_multiuses_mul1:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_bfe_i32 v2, v1, 0, 4
+; GFX11-DL-NEXT:    v_bfe_i32 v3, v1, 4, 4
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_bfe_i32 v4, v0, 4, 4
+; GFX11-DL-NEXT:    v_bfe_i32 v5, v1, 8, 4
+; GFX11-DL-NEXT:    v_bfe_i32 v6, v0, 8, 4
+; GFX11-DL-NEXT:    v_bfe_i32 v7, v0, 0, 4
+; GFX11-DL-NEXT:    v_bfe_i32 v8, v1, 12, 4
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v3, v3, v4
+; GFX11-DL-NEXT:    v_bfe_i32 v9, v0, 12, 4
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v4, v5, v6
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_mad_i32_i24 v5, v2, v7, s2
+; GFX11-DL-NEXT:    v_bfe_i32 v6, v1, 16, 4
+; GFX11-DL-NEXT:    v_bfe_i32 v10, v0, 16, 4
+; GFX11-DL-NEXT:    v_bfe_i32 v11, v1, 20, 4
+; GFX11-DL-NEXT:    v_bfe_i32 v12, v0, 20, 4
+; GFX11-DL-NEXT:    v_mad_i32_i24 v2, v2, v7, v5
+; GFX11-DL-NEXT:    v_bfe_i32 v7, v1, 24, 4
+; GFX11-DL-NEXT:    v_bfe_i32 v13, v0, 24, 4
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v8, v8, v9
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v6, v6, v10
+; GFX11-DL-NEXT:    v_add3_u32 v2, v2, v3, v4
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v3, v11, v12
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v4, v7, v13
+; GFX11-DL-NEXT:    v_ashrrev_i32_e32 v1, 28, v1
+; GFX11-DL-NEXT:    v_ashrrev_i32_e32 v0, 28, v0
+; GFX11-DL-NEXT:    v_add3_u32 v2, v2, v8, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_mul_i32_i24_e32 v0, v1, v0
+; GFX11-DL-NEXT:    v_add3_u32 v1, v2, v3, v4
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_add3_u32 v0, v1, v0, v5
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                 ptr addrspace(1) %src2,
                                                 ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1953,6 +2171,23 @@
 ; GFX10-DL-NOXNACK-NEXT:    v_dot8_i32_i4 v0, v1, v0, s2
 ; GFX10-DL-NOXNACK-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot8_acc32_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot8_i32_iu4 v0, v1, v0, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -2496,6 +2731,99 @@
 ; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v3
 ; GFX10-DL-NOXNACK-NEXT:    global_store_short v2, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot8_acc16_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 4, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v12, 4, v0
+; GFX11-DL-NEXT:    v_lshlrev_b16 v4, 12, v1
+; GFX11-DL-NEXT:    v_lshlrev_b16 v11, 12, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX11-DL-NEXT:    v_lshlrev_b16 v5, 12, v5
+; GFX11-DL-NEXT:    v_lshlrev_b16 v12, 12, v12
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v13, 8, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v14, 12, v0
+; GFX11-DL-NEXT:    v_ashrrev_i16 v4, 12, v4
+; GFX11-DL-NEXT:    v_ashrrev_i16 v11, 12, v11
+; GFX11-DL-NEXT:    v_ashrrev_i16 v12, 12, v12
+; GFX11-DL-NEXT:    v_ashrrev_i16 v5, 12, v5
+; GFX11-DL-NEXT:    v_lshlrev_b16 v6, 12, v6
+; GFX11-DL-NEXT:    v_lshlrev_b16 v7, 12, v7
+; GFX11-DL-NEXT:    v_lshlrev_b16 v8, 12, v8
+; GFX11-DL-NEXT:    v_lshlrev_b16 v13, 12, v13
+; GFX11-DL-NEXT:    v_lshlrev_b16 v14, 12, v14
+; GFX11-DL-NEXT:    v_perm_b32 v11, v12, v11, 0x5040100
+; GFX11-DL-NEXT:    v_perm_b32 v4, v5, v4, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v9, 20, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v16, 20, v0
+; GFX11-DL-NEXT:    v_ashrrev_i16 v6, 12, v6
+; GFX11-DL-NEXT:    v_ashrrev_i16 v7, 12, v7
+; GFX11-DL-NEXT:    v_ashrrev_i16 v5, 12, v8
+; GFX11-DL-NEXT:    v_ashrrev_i16 v8, 12, v13
+; GFX11-DL-NEXT:    v_ashrrev_i16 v12, 12, v14
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v11
+; GFX11-DL-NEXT:    v_lshlrev_b16 v9, 12, v9
+; GFX11-DL-NEXT:    v_lshlrev_b16 v15, 12, v15
+; GFX11-DL-NEXT:    v_lshlrev_b16 v16, 12, v16
+; GFX11-DL-NEXT:    v_perm_b32 v8, v12, v8, 0x5040100
+; GFX11-DL-NEXT:    v_perm_b32 v6, v7, v6, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v4, v3
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v17, 24, v0
+; GFX11-DL-NEXT:    v_ashrrev_i16 v9, 12, v9
+; GFX11-DL-NEXT:    v_ashrrev_i16 v11, 12, v15
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
+; GFX11-DL-NEXT:    v_ashrrev_i16 v4, 12, v16
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v6, v6, v8
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v3, v7
+; GFX11-DL-NEXT:    v_lshlrev_b16 v10, 12, v10
+; GFX11-DL-NEXT:    v_lshlrev_b16 v1, 12, v1
+; GFX11-DL-NEXT:    v_lshlrev_b16 v12, 12, v17
+; GFX11-DL-NEXT:    v_lshlrev_b16 v0, 12, v0
+; GFX11-DL-NEXT:    v_perm_b32 v4, v4, v11, 0x5040100
+; GFX11-DL-NEXT:    v_perm_b32 v5, v9, v5, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v3, v6
+; GFX11-DL-NEXT:    v_ashrrev_i16 v10, 12, v10
+; GFX11-DL-NEXT:    v_ashrrev_i16 v6, 12, v12
+; GFX11-DL-NEXT:    v_ashrrev_i16 v0, 12, v0
+; GFX11-DL-NEXT:    v_ashrrev_i16 v1, 12, v1
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v3, v7
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v6, 0x5040100
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v3, v4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v1, v3, v5
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v1, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -3087,6 +3415,115 @@
 ; GFX10-DL-NOXNACK-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-DL-NOXNACK-NEXT:    global_store_byte v4, v0, s[0:1]
 ; GFX10-DL-NOXNACK-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: idot8_acc8_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v15, 12, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 20, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 28, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v10, 4, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v11, 20, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v13, 28, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v14, 24, v0
+; GFX11-DL-NEXT:    v_lshlrev_b16 v1, 12, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v17, 4, v0
+; GFX11-DL-NEXT:    v_lshlrev_b16 v0, 12, v0
+; GFX11-DL-NEXT:    v_lshlrev_b16 v9, 12, v9
+; GFX11-DL-NEXT:    v_lshlrev_b16 v8, 12, v8
+; GFX11-DL-NEXT:    v_lshlrev_b16 v5, 12, v5
+; GFX11-DL-NEXT:    v_lshlrev_b16 v16, 12, v16
+; GFX11-DL-NEXT:    v_lshlrev_b16 v15, 12, v15
+; GFX11-DL-NEXT:    v_lshlrev_b16 v12, 12, v12
+; GFX11-DL-NEXT:    v_ashrrev_i16 v1, 12, v1
+; GFX11-DL-NEXT:    v_ashrrev_i16 v0, 12, v0
+; GFX11-DL-NEXT:    v_ashrrev_i16 v9, 12, v9
+; GFX11-DL-NEXT:    v_ashrrev_i16 v8, 12, v8
+; GFX11-DL-NEXT:    v_ashrrev_i16 v5, 12, v5
+; GFX11-DL-NEXT:    v_ashrrev_i16 v16, 12, v16
+; GFX11-DL-NEXT:    v_ashrrev_i16 v15, 12, v15
+; GFX11-DL-NEXT:    v_ashrrev_i16 v12, 12, v12
+; GFX11-DL-NEXT:    v_mul_lo_u16 v1, v1, v0
+; GFX11-DL-NEXT:    v_lshlrev_b16 v10, 12, v10
+; GFX11-DL-NEXT:    v_lshlrev_b16 v17, 12, v17
+; GFX11-DL-NEXT:    v_mul_lo_u16 v0, v8, v15
+; GFX11-DL-NEXT:    v_mul_lo_u16 v8, v9, v16
+; GFX11-DL-NEXT:    v_mul_lo_u16 v9, v5, v12
+; GFX11-DL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-DL-NEXT:    v_lshlrev_b16 v7, 12, v7
+; GFX11-DL-NEXT:    v_lshlrev_b16 v6, 12, v6
+; GFX11-DL-NEXT:    v_lshlrev_b16 v3, 12, v3
+; GFX11-DL-NEXT:    v_and_b32_e32 v9, 0xff, v9
+; GFX11-DL-NEXT:    global_load_u8 v2, v4, s[0:1]
+; GFX11-DL-NEXT:    v_lshlrev_b16 v14, 12, v14
+; GFX11-DL-NEXT:    v_lshlrev_b16 v13, 12, v13
+; GFX11-DL-NEXT:    v_lshlrev_b16 v11, 12, v11
+; GFX11-DL-NEXT:    v_ashrrev_i16 v10, 12, v10
+; GFX11-DL-NEXT:    v_ashrrev_i16 v17, 12, v17
+; GFX11-DL-NEXT:    v_ashrrev_i16 v7, 12, v7
+; GFX11-DL-NEXT:    v_ashrrev_i16 v6, 12, v6
+; GFX11-DL-NEXT:    v_ashrrev_i16 v3, 12, v3
+; GFX11-DL-NEXT:    v_ashrrev_i16 v14, 12, v14
+; GFX11-DL-NEXT:    v_ashrrev_i16 v13, 12, v13
+; GFX11-DL-NEXT:    v_ashrrev_i16 v11, 12, v11
+; GFX11-DL-NEXT:    v_mul_lo_u16 v10, v10, v17
+; GFX11-DL-NEXT:    v_lshlrev_b16 v0, 8, v0
+; GFX11-DL-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX11-DL-NEXT:    v_mul_lo_u16 v6, v6, v13
+; GFX11-DL-NEXT:    v_mul_lo_u16 v3, v3, v11
+; GFX11-DL-NEXT:    v_mul_lo_u16 v11, v7, v14
+; GFX11-DL-NEXT:    v_lshlrev_b16 v10, 8, v10
+; GFX11-DL-NEXT:    v_or_b32_e32 v8, v8, v0
+; GFX11-DL-NEXT:    v_lshlrev_b16 v6, 8, v6
+; GFX11-DL-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX11-DL-NEXT:    v_and_b32_e32 v11, 0xff, v11
+; GFX11-DL-NEXT:    v_and_b32_e32 v13, 0xffff, v10
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v8
+; GFX11-DL-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-DL-NEXT:    v_or_b32_e32 v3, v9, v3
+; GFX11-DL-NEXT:    v_or_b32_e32 v9, v11, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_or_b32_e32 v11, v13, v0
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v10, 8, v11
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_add_nc_u16 v2, v1, v2
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v3, v9
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_add_nc_u16 v9, v2, v10
+; GFX11-DL-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v9, v8
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v2
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v5, v12, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v7, v14, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-DL-NEXT:    global_store_b8 v4, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                              ptr addrspace(1) %src2,
                                              ptr addrspace(1) nocapture %dst) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -5,6 +5,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
 
 define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot8_acc32:
@@ -204,6 +206,23 @@
 ; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot8_acc32:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot8_u32_u4 v0, v1, v0, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -522,6 +541,52 @@
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot8_acc16:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u16 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_and_b32_e32 v4, 15, v2
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v5, 15, v0
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 4, 4
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v0, 8, 4
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 12, 4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v0, 16, 4
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 20, 4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v0, 24, 4
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_mad_u16 v0, v2, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
 entry:
@@ -840,6 +905,52 @@
 ; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot8_acc8:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_and_b32_e32 v4, 15, v2
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v5, 15, v0
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 4, 4
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v0, 8, 4
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 12, 4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v0, 16, 4
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 20, 4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v0, 24, 4
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_mad_u16 v0, v2, v0, v3
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1163,6 +1274,54 @@
 ; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot8_acc4:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_and_b32_e32 v4, 15, v2
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v5, 15, v0
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 4, 4
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v0, 8, 4
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 12, 4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v0, 16, 4
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 20, 4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_mad_u16 v0, v2, v0, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1470,6 +1629,54 @@
 ; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v0
 ; GFX10-DL-NEXT:    global_store_byte v1, v0, s[2:3]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot8_CommutationInsideMAD:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-DL-NEXT:    v_and_b32_e32 v4, 15, v2
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v5, 15, v0
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 4, 4
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v0, 8, 4
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 12, 4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v0, 16, 4
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 20, 4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
+; GFX11-DL-NEXT:    v_mad_u16 v3, v6, v7, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v3, v4, v5, v3
+; GFX11-DL-NEXT:    v_mad_u16 v0, v2, v0, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX11-DL-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                       ptr addrspace(1) %src2,
                                                       ptr addrspace(1) nocapture %dst) {
 entry:
@@ -1788,6 +1995,55 @@
 ; GFX10-DL-NEXT:    v_add3_u32 v0, v3, v13, v0
 ; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot8_multiuses_mul1:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v8, 15, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_and_b32_e32 v9, 15, v0
+; GFX11-DL-NEXT:    v_bfe_u32 v2, v1, 4, 4
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v1, 24, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v1, 20, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v1, 16, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v1, v1, 8, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v10, v0, 4, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v11, v0, 8, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v12, v0, 12, 4
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    v_mad_u32_u24 v13, v8, v9, s2
+; GFX11-DL-NEXT:    v_bfe_u32 v14, v0, 20, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v15, v0, 16, 4
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v1, v1, v11
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v7, v7, v12
+; GFX11-DL-NEXT:    v_mad_u32_u24 v2, v2, v10, v13
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v0
+; GFX11-DL-NEXT:    v_bfe_u32 v0, v0, 24, 4
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v6, v6, v15
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v5, v5, v14
+; GFX11-DL-NEXT:    v_add3_u32 v1, v2, v1, v7
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v10
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v0, v4, v0
+; GFX11-DL-NEXT:    v_mul_u32_u24_e32 v3, v8, v9
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add3_u32 v1, v1, v6, v5
+; GFX11-DL-NEXT:    v_add3_u32 v0, v1, v0, v2
+; GFX11-DL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_add3_u32 v0, v3, v13, v0
+; GFX11-DL-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                                 ptr addrspace(1) %src2,
                                                 ptr addrspace(1) nocapture %dst) {
 entry:
@@ -2059,6 +2315,23 @@
 ; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v1, v2, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot8_acc32_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot8_u32_u4 v0, v1, v0, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -2383,6 +2656,73 @@
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
 ; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot8_acc16_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v4, 15, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_and_b32_e32 v5, 15, v0
+; GFX11-DL-NEXT:    global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v0, 4, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v1, 4, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v8, v0, 12, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v9, v1, 12, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
+; GFX11-DL-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
+; GFX11-DL-NEXT:    v_perm_b32 v4, v7, v4, 0x5040100
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v1, 8, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 8, 4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX11-DL-NEXT:    v_perm_b32 v6, v9, v6, 0x5040100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v1, 16, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v9, v0, 20, 4
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v6, v6, v7
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_perm_b32 v5, v10, v5, 0x5040100
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v1, 24, 4
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v4, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v0, 16, 4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v3, v8
+; GFX11-DL-NEXT:    v_perm_b32 v4, v9, v4, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v3, v6
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v0, 24, 4
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v3, v8
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v6, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v3, v4
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_add_nc_u16 v1, v3, v5
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v1, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
+; GFX11-DL-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
 entry:
@@ -2749,6 +3089,80 @@
 ; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-DL-NEXT:    global_store_byte v4, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot8_acc8_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v5, 15, v1
+; GFX11-DL-NEXT:    global_load_u8 v2, v4, s[0:1]
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v1, 12, 4
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_bfe_u32 v12, v0, 12, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v3, v1, 4, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
+; GFX11-DL-NEXT:    v_bfe_u32 v9, v1, 24, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v11, v1, 16, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v1, v0, 4, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v14, v0, 8, 4
+; GFX11-DL-NEXT:    v_mul_lo_u16 v6, v6, v12
+; GFX11-DL-NEXT:    v_and_b32_e32 v13, 15, v0
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v15, 28, v0
+; GFX11-DL-NEXT:    v_bfe_u32 v16, v0, 24, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v12, v0, 16, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v0, v0, 20, 4
+; GFX11-DL-NEXT:    v_mul_lo_u16 v7, v7, v14
+; GFX11-DL-NEXT:    v_mul_lo_u16 v1, v3, v1
+; GFX11-DL-NEXT:    v_lshlrev_b16 v3, 8, v6
+; GFX11-DL-NEXT:    v_mul_lo_u16 v8, v8, v15
+; GFX11-DL-NEXT:    v_mul_lo_u16 v0, v10, v0
+; GFX11-DL-NEXT:    v_mul_lo_u16 v6, v11, v12
+; GFX11-DL-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX11-DL-NEXT:    v_or_b32_e32 v7, v7, v3
+; GFX11-DL-NEXT:    v_mul_lo_u16 v3, v9, v16
+; GFX11-DL-NEXT:    v_lshlrev_b16 v10, 8, v0
+; GFX11-DL-NEXT:    v_lshlrev_b16 v8, 8, v8
+; GFX11-DL-NEXT:    v_and_b32_e32 v14, 0xffff, v1
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
+; GFX11-DL-NEXT:    v_mul_lo_u16 v5, v5, v13
+; GFX11-DL-NEXT:    v_or_b32_e32 v6, v6, v10
+; GFX11-DL-NEXT:    v_or_b32_e32 v3, v3, v8
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_or_b32_e32 v10, v14, v0
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_and_b32_e32 v5, 0xffff, v6
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v6, 8, v10
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_add_nc_u16 v2, v1, v2
+; GFX11-DL-NEXT:    v_or_b32_e32 v1, v5, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_add_nc_u16 v5, v2, v6
+; GFX11-DL-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v5, v7
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v2
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v11, v12, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_mad_u16 v0, v9, v16, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v1
+; GFX11-DL-NEXT:    global_store_b8 v4, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                              ptr addrspace(1) %src2,
                                              ptr addrspace(1) nocapture %dst) {
 entry:
@@ -3077,6 +3491,75 @@
 ; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX10-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot8_acc4_vecMul:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v4, 15, v1
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_and_b32_e32 v5, 15, v0
+; GFX11-DL-NEXT:    global_load_u8 v3, v2, s[0:1]
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v0, 4, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v1, 4, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v8, v0, 12, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v9, v1, 12, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
+; GFX11-DL-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
+; GFX11-DL-NEXT:    v_perm_b32 v4, v7, v4, 0x5040100
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v1, 8, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v0, 8, 4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
+; GFX11-DL-NEXT:    v_perm_b32 v6, v9, v6, 0x5040100
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
+; GFX11-DL-NEXT:    v_bfe_u32 v5, v1, 16, 4
+; GFX11-DL-NEXT:    v_bfe_u32 v9, v0, 20, 4
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v6, v6, v7
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_perm_b32 v5, v10, v5, 0x5040100
+; GFX11-DL-NEXT:    v_bfe_u32 v7, v1, 24, 4
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
+; GFX11-DL-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v4, v3
+; GFX11-DL-NEXT:    v_bfe_u32 v4, v0, 16, 4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v3, v8
+; GFX11-DL-NEXT:    v_perm_b32 v4, v9, v4, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v3, v6
+; GFX11-DL-NEXT:    v_bfe_u32 v6, v0, 24, 4
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v0
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v3, v8
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_perm_b32 v0, v0, v6, 0x5040100
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-DL-NEXT:    v_add_nc_u16 v3, v3, v4
+; GFX11-DL-NEXT:    v_pk_mul_lo_u16 v0, v1, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-NEXT:    v_add_nc_u16 v1, v3, v5
+; GFX11-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v1, v0
+; GFX11-DL-NEXT:    v_add_nc_u16 v0, v0, v3
+; GFX11-DL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX11-DL-NEXT:    global_store_b8 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                              ptr addrspace(1) %src2,
                                              ptr addrspace(1) nocapture %dst) {
 entry:
@@ -3278,6 +3761,23 @@
 ; GFX10-DL-NEXT:    v_dot8_u32_u4 v1, v2, v1, s2
 ; GFX10-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
+;
+; GFX11-DL-LABEL: udot8_variant1:
+; GFX11-DL:       ; %bb.0: ; %entry
+; GFX11-DL-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-DL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DL-NEXT:    s_clause 0x1
+; GFX11-DL-NEXT:    global_load_b32 v1, v0, s[4:5]
+; GFX11-DL-NEXT:    global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-DL-NEXT:    v_dot8_u32_u4 v0, v0, v1, s2
+; GFX11-DL-NEXT:    global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DL-NEXT:    s_endpgm
                                           ptr addrspace(1) %v2addr,
                                           ptr addrspace(1) %dst) {
 entry: