diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2364,6 +2364,8 @@
 def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
+def int_amdgcn_wmma_tied_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_tied_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
 def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
 def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4252,6 +4252,8 @@
     case Intrinsic::amdgcn_sudot8:
    case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
    case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
+   case Intrinsic::amdgcn_wmma_tied_bf16_16x16x16_bf16:
+   case Intrinsic::amdgcn_wmma_tied_f16_16x16x16_f16:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -865,35 +865,26 @@
 // it converts the default pseudo to the pseudo where src2 is not the same as vdst.
 // 3) @earlyclobber on the destination satisfies the constraint during RA.
-multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node, RegisterOperand _Src01RC64, WMMAType Type> {
+multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node, RegisterOperand _Src01RC64, WMMAType Type, bit convertibleTo3Addr> {

   defvar WMMAConstraints2Addr = "@earlyclobber $vdst,$vdst = $src2";
   defvar WMMAConstraints3Addr = "@earlyclobber $vdst";

   defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
-  if !eq(Suffix, "_w32") then {
   let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
-    let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
-      def _twoaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
-    }
-    let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
-      def _threeaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+    let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
+      def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
     }
   }
-  def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w32),
-                          !cast<Instruction>(NAME # _threeaddr_w32)>;
-  } else if !eq(Suffix, "_w64") then {
-    let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
-      let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
-        def _twoaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
-      }
-      let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
-        def _threeaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+  if !eq(convertibleTo3Addr, 1) then {
+    let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+      let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
+        def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+      }
     }
+    def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
+                            !cast<Instruction>(NAME # _threeaddr # Suffix)>;
   }
-  def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w64),
-                          !cast<Instruction>(NAME # _threeaddr_w64)>;
-  }

   if !eq(Type, WMMAOpSel) then {
     def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
@@ -906,21 +897,25 @@
 let WaveSizePredicate = isWave32 in {
-  defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
-  defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
+  defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
+  defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
+  defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>;
+  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>;
+  defm V_WMMA_TIED_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_tied_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_TIED_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_tied_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>;
+  defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp, 1>;
 }

 let WaveSizePredicate = isWave64 in {
-  defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
-  defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
+  defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
+  defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
+  defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>;
+  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>;
+  defm V_WMMA_TIED_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_tied_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_TIED_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_tied_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>;
+  defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp, 1>;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
@@ -4,7 +4,9 @@
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
 declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
+declare <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
 declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
+declare <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
@@ -78,6 +80,55 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_f16_16x16x16_f16_untied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[32:39]
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
+  %res.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
+  store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_f16_16x16x16_f16_tied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38
+; W32-NEXT:    v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36
+; W32-NEXT:    v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34
+; W32-NEXT:    v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[44:51]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
+  %res.1 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
+  store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
@@ -112,6 +163,55 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[32:39]
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
+  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
+  store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38
+; W32-NEXT:    v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36
+; W32-NEXT:    v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34
+; W32-NEXT:    v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[44:51]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
+  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
+  store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8

 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
@@ -4,7 +4,9 @@
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
+declare <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
 declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
@@ -70,6 +72,47 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_f16_16x16x16_f16_untied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[32:35]
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
+  %res.1 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
+  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_f16_16x16x16_f16_tied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_mov_b32_e32 v43, v35
+; W64-NEXT:    v_mov_b32_e32 v42, v34
+; W64-NEXT:    v_mov_b32_e32 v41, v33
+; W64-NEXT:    v_mov_b32_e32 v40, v32
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
+  %res.1 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
+  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
@@ -100,6 +143,47 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[32:35]
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
+  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
+  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_mov_b32_e32 v43, v35
+; W64-NEXT:    v_mov_b32_e32 v42, v34
+; W64-NEXT:    v_mov_b32_e32 v41, v33
+; W64-NEXT:    v_mov_b32_e32 v40, v32
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
+  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
+  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8

 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
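The GlobalISel runs above and the SelectionDAG runs below exercise the same contrast. As a summary in my own words, not taken from the patch: the untied intrinsics still select the two-address pseudo with isConvertibleToThreeAddress = 1, so the accumulator in src2 can feed both WMMAs without copies, while the tied intrinsics pass convertibleTo3Addr = 0, keep vdst tied to src2, and therefore force the v_dual_mov_b32/v_mov_b32 copies seen in the *_tied checks whenever the accumulator is reused. A minimal IR sketch of the intended use, assembled from the declarations above (the function name is hypothetical; wave32 shapes assumed):

; Sketch only: mirrors the *_tied tests. The trailing i1 immarg is the opsel
; bit (the tests here pass i1 0). Reusing %C across both tied calls is what
; forces the backend to copy the accumulator instead of clobbering it.
define amdgpu_ps void @wmma_tied_sketch(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
  %res.0 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
  %res.1 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
  store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
  store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
  ret void
}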
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
@@ -4,7 +4,9 @@
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
 declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
+declare <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
 declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
+declare <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
@@ -78,6 +80,55 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_f16_16x16x16_f16_untied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[32:39]
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
+  %res.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
+  store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_f16_16x16x16_f16_tied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38
+; W32-NEXT:    v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36
+; W32-NEXT:    v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34
+; W32-NEXT:    v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[44:51]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
+  %res.1 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
+  store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
@@ -112,6 +163,55 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[32:39]
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
+  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
+  store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38
+; W32-NEXT:    v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36
+; W32-NEXT:    v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34
+; W32-NEXT:    v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[44:51]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
+  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
+  store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8

 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
@@ -4,7 +4,9 @@
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
+declare <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
 declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
@@ -70,6 +72,47 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_f16_16x16x16_f16_untied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[32:35]
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
+  %res.1 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
+  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_f16_16x16x16_f16_tied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_mov_b32_e32 v43, v35
+; W64-NEXT:    v_mov_b32_e32 v42, v34
+; W64-NEXT:    v_mov_b32_e32 v41, v33
+; W64-NEXT:    v_mov_b32_e32 v40, v32
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
+  %res.1 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
+  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
@@ -100,6 +143,47 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[32:35]
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
+  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
+  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_mov_b32_e32 v43, v35
+; W64-NEXT:    v_mov_b32_e32 v42, v34
+; W64-NEXT:    v_mov_b32_e32 v41, v33
+; W64-NEXT:    v_mov_b32_e32 v40, v32
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
+  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
+  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8

 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
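For the wave64 shapes, the same pattern holds with the narrower accumulator: the declarations above use <8 x half> and <8 x i16> for the result/accumulator while the A and B operands stay <16 x half>/<16 x i16>. A hedged sketch with a hypothetical function name:

; Sketch only, wave64: the <8 x i16> accumulator is tied to the result.
define amdgpu_ps void @wmma_tied_bf16_w64_sketch(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
  %res = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
  store <8 x i16> %res, ptr addrspace(1) %out, align 32
  ret void
}

The likely reason the tied variants exist, hedged since this excerpt carries no commit message: for the 16-bit result types the instruction writes only one half of each destination VGPR (the opsel bit selects which), so the untouched halves must be preserved from src2. Tying vdst to src2 guarantees that, whereas converting to the three-address form would leave those halves unrelated to the accumulator.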