diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -855,11 +855,12 @@
defvar WMMAConstraints2Addr = "@earlyclobber $vdst,$vdst = $src2";
defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
+ defvar isConvertableTo3Addr = !cond(!eq(Instr, "v_wmma_f16_16x16x16_f16"): 0, true: 1);
defvar WMMAProfile = VOPProfileWMMA
;
if !eq(Suffix, "_w32") then {
let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
- let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
+ let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = isConvertableTo3Addr in {
def _twoaddr_w32 : VOP3P_Pseudo;
}
let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
@@ -870,7 +871,7 @@
!cast(NAME # _threeaddr_w32)>;
} else if !eq(Suffix, "_w64") then {
let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
- let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
+ let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = isConvertableTo3Addr in {
def _twoaddr_w64 : VOP3P_Pseudo;
}
let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir
@@ -1,8 +1,6 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
# RUN: llc -march=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s
-# GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32
-# GCN: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec
-
---
name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32
registers:
@@ -12,14 +10,16 @@
body: |
bb.0:
+ ; GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32
+ ; GCN: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GCN-NEXT: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, implicit $exec, implicit $exec
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, implicit $exec
...
-# GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w32
-# GCN: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_BF16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec
---
name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w32
@@ -30,14 +30,16 @@
body: |
bb.0:
+ ; GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w32
+ ; GCN: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GCN-NEXT: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_BF16_threeaddr_w32 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, implicit $exec, implicit $exec
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, implicit $exec
...
-# GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w32
-# GCN: early-clobber %2:vreg_256 = V_WMMA_F16_16X16X16_F16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec
---
name: test_v_wmma_f16_16x16x16_f16_twoaddr_w32
@@ -48,14 +50,17 @@
body: |
bb.0:
+ ; GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w32
+ ; GCN: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY [[DEF]]
+ ; GCN-NEXT: early-clobber [[COPY]]:vreg_256 = V_WMMA_F16_16X16X16_F16_twoaddr_w32 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[COPY]], 0, 0, 0, 0, implicit $exec
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
early-clobber %2:vreg_256 = V_WMMA_F16_16X16X16_F16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, 0, 0, implicit $exec
...
-# GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w32
-# GCN: early-clobber %2:vreg_256 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec
---
name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w32
@@ -66,14 +71,16 @@
body: |
bb.0:
+ ; GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w32
+ ; GCN: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GCN-NEXT: early-clobber %2:vreg_256 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, 0, 0, implicit $exec, implicit $exec
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
early-clobber %2:vreg_256 = V_WMMA_BF16_16X16X16_BF16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, 0, 0, implicit $exec
...
-# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w32
-# GCN: early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU8_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec
---
name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w32
@@ -84,14 +91,16 @@
body: |
bb.0:
+ ; GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w32
+ ; GCN: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+ ; GCN-NEXT: early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU8_threeaddr_w32 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, 0, implicit $exec, implicit $exec
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU8_twoaddr_w32 8, killed %1:vreg_128, 8, killed %1:vreg_128, 8, %0:vreg_256, 0, 0, 0, implicit $exec
...
-# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w32
-# GCN: early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU4_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec
---
name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w32
@@ -102,14 +111,16 @@
body: |
bb.0:
+ ; GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w32
+ ; GCN: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; GCN-NEXT: early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU4_threeaddr_w32 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, 0, implicit $exec, implicit $exec
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU4_twoaddr_w32 8, killed %1:vreg_64, 8, killed %1:vreg_64, 8, %0:vreg_256, 0, 0, 0, implicit $exec
...
-# GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w64
-# GCN: early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_F16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec
---
name: test_v_wmma_f32_16x16x16_f16_twoaddr_w64
@@ -120,14 +131,16 @@
body: |
bb.0:
+ ; GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w64
+ ; GCN: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GCN-NEXT: early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_F16_threeaddr_w64 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, implicit $exec, implicit $exec
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, implicit $exec
...
-# GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w64
-# GCN: early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_BF16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec
---
name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w64
@@ -138,14 +151,16 @@
body: |
bb.0:
+ ; GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w64
+ ; GCN: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GCN-NEXT: early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_BF16_threeaddr_w64 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, implicit $exec, implicit $exec
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_BF16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, implicit $exec
...
-# GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w64
-# GCN: early-clobber %2:vreg_128 = V_WMMA_F16_16X16X16_F16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec
---
name: test_v_wmma_f16_16x16x16_f16_twoaddr_w64
@@ -156,14 +171,17 @@
body: |
bb.0:
+ ; GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w64
+ ; GCN: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY [[DEF]]
+ ; GCN-NEXT: early-clobber [[COPY]]:vreg_128 = V_WMMA_F16_16X16X16_F16_twoaddr_w64 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[COPY]], 0, 0, 0, 0, implicit $exec
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
early-clobber %2:vreg_128 = V_WMMA_F16_16X16X16_F16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, 0, 0, implicit $exec
...
-# GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w64
-# GCN: early-clobber %2:vreg_128 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec
---
name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w64
@@ -174,14 +192,16 @@
body: |
bb.0:
+ ; GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w64
+ ; GCN: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF
+ ; GCN-NEXT: early-clobber %2:vreg_128 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w64 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, 0, 0, implicit $exec, implicit $exec
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
early-clobber %2:vreg_128 = V_WMMA_BF16_16X16X16_BF16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, 0, 0, implicit $exec
...
-# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w64
-# GCN: early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU8_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec
---
name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w64
@@ -192,14 +212,16 @@
body: |
bb.0:
+ ; GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w64
+ ; GCN: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+ ; GCN-NEXT: early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU8_threeaddr_w64 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, 0, implicit $exec, implicit $exec
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU8_twoaddr_w64 8, killed %1:vreg_128, 8, killed %1:vreg_128, 8, %0:vreg_128, 0, 0, 0, implicit $exec
...
-# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w64
-# GCN: early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU4_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec
---
name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w64
@@ -210,6 +232,10 @@
body: |
bb.0:
+ ; GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w64
+ ; GCN: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; GCN-NEXT: early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU4_threeaddr_w64 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, 0, implicit $exec, implicit $exec
%0 = IMPLICIT_DEF
%1 = IMPLICIT_DEF
early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU4_twoaddr_w64 8, killed %1:vreg_64, 8, killed %1:vreg_64, 8, %0:vreg_128, 0, 0, 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
@@ -71,8 +71,13 @@
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
; W32: ; %bb.0: ; %bb
-; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
+; W32-NEXT: v_dual_mov_b32 v35, v23 :: v_dual_mov_b32 v34, v22
+; W32-NEXT: v_dual_mov_b32 v33, v21 :: v_dual_mov_b32 v32, v20
+; W32-NEXT: v_dual_mov_b32 v31, v19 :: v_dual_mov_b32 v30, v18
+; W32-NEXT: v_dual_mov_b32 v29, v17 :: v_dual_mov_b32 v28, v16
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23]
+; W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[28:35]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
@@ -93,8 +98,13 @@
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
; W32: ; %bb.0: ; %bb
-; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
+; W32-NEXT: v_dual_mov_b32 v35, v23 :: v_dual_mov_b32 v34, v22
+; W32-NEXT: v_dual_mov_b32 v33, v21 :: v_dual_mov_b32 v32, v20
+; W32-NEXT: v_dual_mov_b32 v31, v19 :: v_dual_mov_b32 v30, v18
+; W32-NEXT: v_dual_mov_b32 v29, v17 :: v_dual_mov_b32 v28, v16
; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1]
+; W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[28:35] op_sel:[0,0,1]
; W32-NEXT: s_clause 0x1
; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16
; W32-NEXT: global_store_b128 v[24:25], v[28:31], off
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
@@ -63,8 +63,13 @@
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
; W64: ; %bb.0: ; %bb
-; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
+; W64-NEXT: v_mov_b32_e32 v27, v19
+; W64-NEXT: v_mov_b32_e32 v26, v18
+; W64-NEXT: v_mov_b32_e32 v25, v17
+; W64-NEXT: v_mov_b32_e32 v24, v16
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19]
+; W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[24:27]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_nop 0
@@ -81,8 +86,13 @@
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) {
; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
; W64: ; %bb.0: ; %bb
-; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
+; W64-NEXT: v_mov_b32_e32 v27, v19
+; W64-NEXT: v_mov_b32_e32 v26, v18
+; W64-NEXT: v_mov_b32_e32 v25, v17
+; W64-NEXT: v_mov_b32_e32 v24, v16
; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1]
+; W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[24:27] op_sel:[0,0,1]
; W64-NEXT: global_store_b128 v[20:21], v[24:27], off
; W64-NEXT: global_store_b128 v[22:23], v[16:19], off
; W64-NEXT: s_nop 0