diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -855,11 +855,12 @@ defvar WMMAConstraints2Addr = "@earlyclobber $vdst,$vdst = $src2"; defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; + defvar isConvertibleTo3Addr = !cond(!eq(Instr, "v_wmma_f16_16x16x16_f16"): 0, true: 1); defvar WMMAProfile = VOPProfileWMMA; if !eq(Suffix, "_w32") then { let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { - let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in { + let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = isConvertibleTo3Addr in { def _twoaddr_w32 : VOP3P_Pseudo; } let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in { @@ -870,7 +871,7 @@ !cast(NAME # _threeaddr_w32)>; } else if !eq(Suffix, "_w64") then { let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { - let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in { + let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = isConvertibleTo3Addr in { def _twoaddr_w64 : VOP3P_Pseudo; } let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in { diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir b/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir --- a/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir +++ b/llvm/test/CodeGen/AMDGPU/twoaddr-wmma.mir @@ -1,8 +1,6 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 # RUN: llc -march=amdgcn -mcpu=gfx1100 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s -# GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32 -# GCN: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec - --- name: 
test_v_wmma_f32_16x16x16_f16_twoaddr_w32 registers: @@ -12,14 +10,16 @@ body: | bb.0: + ; GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w32 + ; GCN: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF + ; GCN-NEXT: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, implicit $exec, implicit $exec %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, implicit $exec ... -# GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w32 -# GCN: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_BF16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec --- name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w32 @@ -30,14 +30,16 @@ body: | bb.0: + ; GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w32 + ; GCN: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF + ; GCN-NEXT: early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_BF16_threeaddr_w32 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, implicit $exec, implicit $exec %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_256 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, implicit $exec ... 
-# GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w32 -# GCN: early-clobber %2:vreg_256 = V_WMMA_F16_16X16X16_F16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec --- name: test_v_wmma_f16_16x16x16_f16_twoaddr_w32 @@ -48,14 +50,17 @@ body: | bb.0: + ; GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w32 + ; GCN: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY [[DEF]] + ; GCN-NEXT: early-clobber [[COPY]]:vreg_256 = V_WMMA_F16_16X16X16_F16_twoaddr_w32 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[COPY]], 0, 0, 0, 0, implicit $exec %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_256 = V_WMMA_F16_16X16X16_F16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, 0, 0, implicit $exec ... -# GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w32 -# GCN: early-clobber %2:vreg_256 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec --- name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w32 @@ -66,14 +71,16 @@ body: | bb.0: + ; GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w32 + ; GCN: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF + ; GCN-NEXT: early-clobber %2:vreg_256 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, 0, 0, implicit $exec, implicit $exec %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_256 = V_WMMA_BF16_16X16X16_BF16_twoaddr_w32 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_256, 0, 0, 0, 0, implicit $exec ... 
-# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w32 -# GCN: early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU8_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec --- name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w32 @@ -84,14 +91,16 @@ body: | bb.0: + ; GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w32 + ; GCN: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; GCN-NEXT: early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU8_threeaddr_w32 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, 0, implicit $exec, implicit $exec %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU8_twoaddr_w32 8, killed %1:vreg_128, 8, killed %1:vreg_128, 8, %0:vreg_256, 0, 0, 0, implicit $exec ... -# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w32 -# GCN: early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU4_threeaddr_w32 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec --- name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w32 @@ -102,14 +111,16 @@ body: | bb.0: + ; GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w32 + ; GCN: [[DEF:%[0-9]+]]:vreg_256 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN-NEXT: early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU4_threeaddr_w32 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, 0, implicit $exec, implicit $exec %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_256 = V_WMMA_I32_16X16X16_IU4_twoaddr_w32 8, killed %1:vreg_64, 8, killed %1:vreg_64, 8, %0:vreg_256, 0, 0, 0, implicit $exec ... 
-# GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w64 -# GCN: early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_F16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec --- name: test_v_wmma_f32_16x16x16_f16_twoaddr_w64 @@ -120,14 +131,16 @@ body: | bb.0: + ; GCN-LABEL: name: test_v_wmma_f32_16x16x16_f16_twoaddr_w64 + ; GCN: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF + ; GCN-NEXT: early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_F16_threeaddr_w64 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, implicit $exec, implicit $exec %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_F16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, implicit $exec ... -# GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w64 -# GCN: early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_BF16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, implicit $exec --- name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w64 @@ -138,14 +151,16 @@ body: | bb.0: + ; GCN-LABEL: name: test_v_wmma_f32_16x16x16_bf16_twoaddr_w64 + ; GCN: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF + ; GCN-NEXT: early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_BF16_threeaddr_w64 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, implicit $exec, implicit $exec %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_128 = V_WMMA_F32_16X16X16_BF16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, implicit $exec ... 
-# GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w64 -# GCN: early-clobber %2:vreg_128 = V_WMMA_F16_16X16X16_F16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec --- name: test_v_wmma_f16_16x16x16_f16_twoaddr_w64 @@ -156,14 +171,17 @@ body: | bb.0: + ; GCN-LABEL: name: test_v_wmma_f16_16x16x16_f16_twoaddr_w64 + ; GCN: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY [[DEF]] + ; GCN-NEXT: early-clobber [[COPY]]:vreg_128 = V_WMMA_F16_16X16X16_F16_twoaddr_w64 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[COPY]], 0, 0, 0, 0, implicit $exec %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_128 = V_WMMA_F16_16X16X16_F16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, 0, 0, implicit $exec ... -# GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w64 -# GCN: early-clobber %2:vreg_128 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, 0, implicit $exec --- name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w64 @@ -174,14 +192,16 @@ body: | bb.0: + ; GCN-LABEL: name: test_v_wmma_bf16_16x16x16_bf16_twoaddr_w64 + ; GCN: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_256 = IMPLICIT_DEF + ; GCN-NEXT: early-clobber %2:vreg_128 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w64 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, 0, 0, implicit $exec, implicit $exec %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_128 = V_WMMA_BF16_16X16X16_BF16_twoaddr_w64 8, killed %1:vreg_256, 8, killed %1:vreg_256, 8, %0:vreg_128, 0, 0, 0, 0, implicit $exec ... 
-# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w64 -# GCN: early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU8_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec --- name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w64 @@ -192,14 +212,16 @@ body: | bb.0: + ; GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu8_twoaddr_w64 + ; GCN: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; GCN-NEXT: early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU8_threeaddr_w64 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, 0, implicit $exec, implicit $exec %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU8_twoaddr_w64 8, killed %1:vreg_128, 8, killed %1:vreg_128, 8, %0:vreg_128, 0, 0, 0, implicit $exec ... -# GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w64 -# GCN: early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU4_threeaddr_w64 8, killed %1, 8, killed %1, 8, %0, 0, 0, 0, implicit $exec --- name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w64 @@ -210,6 +232,10 @@ body: | bb.0: + ; GCN-LABEL: name: test_v_wmma_i32_16x16x16_iu4_twoaddr_w64 + ; GCN: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; GCN-NEXT: early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU4_threeaddr_w64 8, killed [[DEF1]], 8, killed [[DEF1]], 8, [[DEF]], 0, 0, 0, implicit $exec, implicit $exec %0 = IMPLICIT_DEF %1 = IMPLICIT_DEF early-clobber %2:vreg_128 = V_WMMA_I32_16X16X16_IU4_twoaddr_w64 8, killed %1:vreg_64, 8, killed %1:vreg_64, 8, %0:vreg_128, 0, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll --- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll @@ -71,8 +71,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) { ; W32-LABEL: 
test_wmma_f16_16x16x16_f16_lo: ; W32: ; %bb.0: ; %bb -; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] +; W32-NEXT: v_dual_mov_b32 v35, v23 :: v_dual_mov_b32 v34, v22 +; W32-NEXT: v_dual_mov_b32 v33, v21 :: v_dual_mov_b32 v32, v20 +; W32-NEXT: v_dual_mov_b32 v31, v19 :: v_dual_mov_b32 v30, v18 +; W32-NEXT: v_dual_mov_b32 v29, v17 :: v_dual_mov_b32 v28, v16 ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] +; W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[28:35] ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[28:31], off @@ -93,8 +98,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) { ; W32-LABEL: test_wmma_f16_16x16x16_f16_hi: ; W32: ; %bb.0: ; %bb -; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1] +; W32-NEXT: v_dual_mov_b32 v35, v23 :: v_dual_mov_b32 v34, v22 +; W32-NEXT: v_dual_mov_b32 v33, v21 :: v_dual_mov_b32 v32, v20 +; W32-NEXT: v_dual_mov_b32 v31, v19 :: v_dual_mov_b32 v30, v18 +; W32-NEXT: v_dual_mov_b32 v29, v17 :: v_dual_mov_b32 v28, v16 ; W32-NEXT: v_wmma_f16_16x16x16_f16 v[16:23], v[8:15], v[8:15], v[16:23] op_sel:[0,0,1] +; W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; W32-NEXT: v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[28:35] op_sel:[0,0,1] ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[32:35], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[28:31], off diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll --- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll @@ -63,8 +63,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr 
addrspace(1) %out, ptr addrspace(1) %out2) { ; W64-LABEL: test_wmma_f16_16x16x16_f16_lo: ; W64: ; %bb.0: ; %bb -; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] +; W64-NEXT: v_mov_b32_e32 v27, v19 +; W64-NEXT: v_mov_b32_e32 v26, v18 +; W64-NEXT: v_mov_b32_e32 v25, v17 +; W64-NEXT: v_mov_b32_e32 v24, v16 ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] +; W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[24:27] ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off ; W64-NEXT: s_nop 0 @@ -81,8 +86,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out, ptr addrspace(1) %out2) { ; W64-LABEL: test_wmma_f16_16x16x16_f16_hi: ; W64: ; %bb.0: ; %bb -; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] +; W64-NEXT: v_mov_b32_e32 v27, v19 +; W64-NEXT: v_mov_b32_e32 v26, v18 +; W64-NEXT: v_mov_b32_e32 v25, v17 +; W64-NEXT: v_mov_b32_e32 v24, v16 ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1] +; W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; W64-NEXT: v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[24:27] op_sel:[0,0,1] ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off ; W64-NEXT: s_nop 0