diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2364,6 +2364,8 @@
 def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
 def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
+def int_amdgcn_wmma_tied_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_tied_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
 def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
 def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4252,6 +4252,8 @@
     case Intrinsic::amdgcn_sudot8:
    case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
    case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
+   case Intrinsic::amdgcn_wmma_tied_bf16_16x16x16_bf16:
+   case Intrinsic::amdgcn_wmma_tied_f16_16x16x16_f16:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
    case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
    case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -865,35 +865,26 @@
 // it converts the default pseudo to the pseudo where src2 is not the same as vdst.
 // 3) @earlyclobber on the destination satisfies the constraint during RA.
-multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node, RegisterOperand _Src01RC64, WMMAType Type> {
+multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node, RegisterOperand _Src01RC64, WMMAType Type, bit convertibleTo3Addr> {

   defvar WMMAConstraints2Addr = "@earlyclobber $vdst,$vdst = $src2";
   defvar WMMAConstraints3Addr = "@earlyclobber $vdst";

   defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
-  if !eq(Suffix, "_w32") then {
   let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
-    let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
-      def _twoaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
-    }
-    let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
-      def _threeaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+    let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
+      def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
     }
   }
-  def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w32),
-                          !cast<Instruction>(NAME # _threeaddr_w32)>;
-  } else if !eq(Suffix, "_w64") then {
-    let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
-      let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
-        def _twoaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
-      }
-      let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
-        def _threeaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+  if !eq(convertibleTo3Addr, 1) then {
+    let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+      let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
+        def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+      }
     }
+    def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
+                            !cast<Instruction>(NAME # _threeaddr # Suffix)>;
   }
-  def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w64),
-                          !cast<Instruction>(NAME # _threeaddr_w64)>;
-  }

   if !eq(Type, WMMAOpSel) then {
     def : WMMAOpSelPat<!cast<Instruction>(NAME # _twoaddr # Suffix), node, P>;
@@ -906,21 +897,25 @@
 let WaveSizePredicate = isWave32 in {
-  defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
-  defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
+  defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
+  defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
+  defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>;
+  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>;
+  defm V_WMMA_TIED_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_tied_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_TIED_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_tied_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>;
+  defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp, 1>;
 }

 let WaveSizePredicate = isWave64 in {
-  defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
-  defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
+  defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
+  defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
+  defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>;
+  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>;
+  defm V_WMMA_TIED_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_tied_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_TIED_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_tied_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 0>;
+  defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>;
+  defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp, 1>;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
@@ -4,7 +4,9 @@
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
 declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
+declare <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
 declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
+declare <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
@@ -78,6 +80,55 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_f16_16x16x16_f16_untied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[32:39]
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
+  %res.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
+  store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_f16_16x16x16_f16_tied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38
+; W32-NEXT:    v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36
+; W32-NEXT:    v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34
+; W32-NEXT:    v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[44:51]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
+  %res.1 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
+  store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
@@ -112,6 +163,55 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[32:39]
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
+  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
+  store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38
+; W32-NEXT:    v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36
+; W32-NEXT:    v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34
+; W32-NEXT:    v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[44:51]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
+  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
+  store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8

 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
@@ -4,7 +4,9 @@
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
+declare <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
 declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
@@ -70,6 +72,47 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_f16_16x16x16_f16_untied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[32:35]
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
+  %res.1 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
+  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_f16_16x16x16_f16_tied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_mov_b32_e32 v43, v35
+; W64-NEXT:    v_mov_b32_e32 v42, v34
+; W64-NEXT:    v_mov_b32_e32 v41, v33
+; W64-NEXT:    v_mov_b32_e32 v40, v32
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
+  %res.1 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
+  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
@@ -100,6 +143,47 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[32:35]
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
+  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
+  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_mov_b32_e32 v43, v35
+; W64-NEXT:    v_mov_b32_e32 v42, v34
+; W64-NEXT:    v_mov_b32_e32 v41, v33
+; W64-NEXT:    v_mov_b32_e32 v40, v32
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
+  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
+  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8

 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
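The GlobalISel runs above and the SelectionDAG runs below exercise the same contrast. As a summary in my own words, not taken from the patch: the untied intrinsics still select the two-address pseudo with isConvertibleToThreeAddress = 1, so the accumulator in src2 can feed both WMMAs without copies, while the tied intrinsics pass convertibleTo3Addr = 0, keep vdst tied to src2, and therefore force the v_dual_mov_b32/v_mov_b32 copies seen in the *_tied checks whenever the accumulator is reused. A minimal IR sketch of the intended use, assembled from the declarations above (the function name is hypothetical; wave32 shapes assumed):

; Sketch only: mirrors the *_tied tests. The trailing i1 immarg is the opsel
; bit (the tests here pass i1 0). Reusing %C across both tied calls is what
; forces the backend to copy the accumulator instead of clobbering it.
define amdgpu_ps void @wmma_tied_sketch(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
  %res.0 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
  %res.1 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
  store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
  store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
  ret void
}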
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
@@ -4,7 +4,9 @@
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
 declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
+declare <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
 declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
+declare <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
@@ -78,6 +80,55 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_f16_16x16x16_f16_untied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[32:39]
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
+  %res.1 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
+  store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_f16_16x16x16_f16_tied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38
+; W32-NEXT:    v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36
+; W32-NEXT:    v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34
+; W32-NEXT:    v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[44:51], v[0:7], v[8:15], v[44:51]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %C, i1 0)
+  %res.1 = call <16 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <16 x half> %C, i1 0)
+  store <16 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) {
@@ -112,6 +163,55 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[32:39]
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
+  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
+  store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W32-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
+; W32:       ; %bb.0: ; %bb
+; W32-NEXT:    v_dual_mov_b32 v51, v39 :: v_dual_mov_b32 v50, v38
+; W32-NEXT:    v_dual_mov_b32 v49, v37 :: v_dual_mov_b32 v48, v36
+; W32-NEXT:    v_dual_mov_b32 v47, v35 :: v_dual_mov_b32 v46, v34
+; W32-NEXT:    v_dual_mov_b32 v45, v33 :: v_dual_mov_b32 v44, v32
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:39], v[16:23], v[24:31], v[32:39]
+; W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[44:51], v[0:7], v[8:15], v[44:51]
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[40:41], v[48:51], off offset:16
+; W32-NEXT:    global_store_b128 v[40:41], v[44:47], off
+; W32-NEXT:    s_clause 0x1
+; W32-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
+; W32-NEXT:    global_store_b128 v[42:43], v[32:35], off
+; W32-NEXT:    s_nop 0
+; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W32-NEXT:    s_endpgm
+bb:
+  %res.0 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %C, i1 0)
+  %res.1 = call <16 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <16 x i16> %C, i1 0)
+  store <16 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <16 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8

 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
@@ -4,7 +4,9 @@
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
 declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
+declare <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
 declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
@@ -70,6 +72,47 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_untied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_f16_16x16x16_f16_untied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[32:35]
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
+  %res.1 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
+  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_tied(<16 x half> %A.0, <16 x half> %B.0, <16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_f16_16x16x16_f16_tied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_mov_b32_e32 v43, v35
+; W64-NEXT:    v_mov_b32_e32 v42, v34
+; W64-NEXT:    v_mov_b32_e32 v41, v33
+; W64-NEXT:    v_mov_b32_e32 v40, v32
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.0, <16 x half> %B.0, <8 x half> %C, i1 0)
+  %res.1 = call <8 x half> @llvm.amdgcn.wmma.tied.f16.16x16x16.f16(<16 x half> %A.1, <16 x half> %B.1, <8 x half> %C, i1 0)
+  store <8 x half> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x half> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16

 define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
@@ -100,6 +143,47 @@
   ret void
 }

+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_untied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_bf16_16x16x16_bf16_untied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[32:35]
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
+  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
+  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_tied(<16 x i16> %A.0, <16 x i16> %B.0, <16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, ptr addrspace(1) %out.0, ptr addrspace(1) %out.1) {
+; W64-LABEL: test_wmma_bf16_16x16x16_bf16_tied:
+; W64:       ; %bb.0: ; %bb
+; W64-NEXT:    v_mov_b32_e32 v43, v35
+; W64-NEXT:    v_mov_b32_e32 v42, v34
+; W64-NEXT:    v_mov_b32_e32 v41, v33
+; W64-NEXT:    v_mov_b32_e32 v40, v32
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[32:35], v[16:23], v[24:31], v[32:35]
+; W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[40:43], v[0:7], v[8:15], v[40:43]
+; W64-NEXT:    global_store_b128 v[36:37], v[40:43], off
+; W64-NEXT:    global_store_b128 v[38:39], v[32:35], off
+; W64-NEXT:    s_nop 0
+; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; W64-NEXT:    s_endpgm
+bb:
+  %res.0 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.0, <16 x i16> %B.0, <8 x i16> %C, i1 0)
+  %res.1 = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A.1, <16 x i16> %B.1, <8 x i16> %C, i1 0)
+  store <8 x i16> %res.0, ptr addrspace(1) %out.0, align 32
+  store <8 x i16> %res.1, ptr addrspace(1) %out.1, align 32
+  ret void
+}
+
 ; @llvm.amdgcn.wmma.i32.16x16x16.iu8

 define amdgpu_ps void @test_wmma_i32_16x16x16_ui8_unsigned_unsigned(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, ptr addrspace(1) %out) {
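For the wave64 shapes, the same pattern holds with the narrower accumulator: the declarations above use <8 x half> and <8 x i16> for the result/accumulator while the A and B operands stay <16 x half>/<16 x i16>. A hedged sketch with a hypothetical function name:

; Sketch only, wave64: the <8 x i16> accumulator is tied to the result.
define amdgpu_ps void @wmma_tied_bf16_w64_sketch(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
  %res = call <8 x i16> @llvm.amdgcn.wmma.tied.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
  store <8 x i16> %res, ptr addrspace(1) %out, align 32
  ret void
}

The likely reason the tied variants exist, hedged since this excerpt carries no commit message: for the 16-bit result types the instruction writes only one half of each destination VGPR (the opsel bit selects which), so the untouched halves must be preserved from src2. Tying vdst to src2 guarantees that, whereas converting to the three-address form would leave those halves unrelated to the accumulator.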