diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2023,10 +2023,10 @@
   [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
 >;
-def int_amdgcn_wmma_f32_16x16x16_f16   : AMDGPUWmmaIntrinsic<llvm_v8f32_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_f32_16x16x16_bf16  : AMDGPUWmmaIntrinsic<llvm_v8i32_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_v8f32_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v8i32_ty, llvm_anyint_ty>;
+def int_amdgcn_wmma_f32_16x16x16_f16   : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf16  : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
 def int_amdgcn_wmma_i32_16x16x16_iu8   : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
 def int_amdgcn_wmma_i32_16x16x16_iu4   : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -329,13 +329,14 @@
 //===----------------------------------------------------------------------===//
 
 // Returns 1 if the source arguments have modifiers, 0 if they do not.
-// XXX - do f16 instructions?
 class isFloatType<ValueType SrcVT> {
   bit ret = !or(!eq(SrcVT.Value, f16.Value),
                 !eq(SrcVT.Value, f32.Value),
                 !eq(SrcVT.Value, f64.Value),
                 !eq(SrcVT.Value, v2f16.Value),
                 !eq(SrcVT.Value, v4f16.Value),
+                !eq(SrcVT.Value, v8f16.Value),
+                !eq(SrcVT.Value, v16f16.Value),
                 !eq(SrcVT.Value, v2f32.Value),
                 !eq(SrcVT.Value, v4f32.Value),
                 !eq(SrcVT.Value, v8f32.Value),
@@ -343,10 +344,14 @@
                 !eq(SrcVT.Value, v4f64.Value));
 }
 
+// XXX - do v2i16 instructions?
 class isIntType<ValueType SrcVT> {
   bit ret = !or(!eq(SrcVT.Value, i16.Value),
                 !eq(SrcVT.Value, i32.Value),
                 !eq(SrcVT.Value, i64.Value),
+                !eq(SrcVT.Value, v4i16.Value),
+                !eq(SrcVT.Value, v8i16.Value),
+                !eq(SrcVT.Value, v16i16.Value),
                 !eq(SrcVT.Value, v2i32.Value),
                 !eq(SrcVT.Value, v4i32.Value),
                 !eq(SrcVT.Value, v8i32.Value));
@@ -1733,10 +1738,16 @@
                !eq(SrcVT.Value, v2i16.Value),
                !eq(SrcVT.Value, v2f32.Value),
                !eq(SrcVT.Value, v2i32.Value),
+               !eq(SrcVT.Value, v4f16.Value),
+               !eq(SrcVT.Value, v4i16.Value),
                !eq(SrcVT.Value, v4f32.Value),
                !eq(SrcVT.Value, v4i32.Value),
+               !eq(SrcVT.Value, v8f16.Value),
+               !eq(SrcVT.Value, v8i16.Value),
                !eq(SrcVT.Value, v8f32.Value),
-               !eq(SrcVT.Value, v8i32.Value));
+               !eq(SrcVT.Value, v8i32.Value),
+               !eq(SrcVT.Value, v16f16.Value),
+               !eq(SrcVT.Value, v16i16.Value));
 }
 
 // Return type of input modifiers operand for specified input operand
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -684,20 +684,24 @@
   let Src2RC64 = !if(!eq(Suffix, "_w32"), VISrc_256_f64, VISrc_128_f32);
   let HasClamp = _HasClamp;
   let HasOpSel = _HasOpSel;
+  let IsPacked = 1;
   let IsWMMA = 1;
 }
 
-def VOP_V8F32_V8F32_V8F32_V8F32 : VOPProfile <[v8f32, v8f32, v8f32, v8f32]>;
-def VOP_V8F32_V8I32_V8I32_V8F32 : VOPProfile <[v8f32, v8i32, v8i32, v8f32]>;
+def VOP_V8F32_V16F16_V16F16_V8F32 : VOPProfile <[v8f32, v16f16, v16f16, v8f32]>;
+def VOP_V8F32_V16I16_V16I16_V8F32 : VOPProfile <[v8f32, v16i16, v16i16, v8f32]>;
+def VOP_V16F16_V16F16_V16F16_V16F16 : VOPProfile <[v16f16, v16f16, v16f16, v16f16]>;
+def VOP_V16I16_V16I16_V16I16_V16I16 : VOPProfile <[v16i16, v16i16, v16i16, v16i16]>;
 def VOP_V8I32_V4I32_V4I32_V8I32 : VOPProfile <[v8i32, v4i32, v4i32, v8i32]>;
 def VOP_V8I32_V2I32_V2I32_V8I32 : VOPProfile <[v8i32, v2i32, v2i32, v8i32]>;
-def VOP_V8I32_V8I32_V8I32_V8I32 : VOPProfile <[v8i32, v8i32, v8i32, v8i32]>;
-def VOP_V4F32_V8F32_V8F32_V4F32 : VOPProfile <[v4f32, v8f32, v8f32, v4f32]>;
-def VOP_V4F32_V8I32_V8I32_V4F32 : VOPProfile <[v4f32, v8i32, v8i32, v4f32]>;
+def VOP_V4F32_V16F16_V16F16_V4F32 : VOPProfile <[v4f32, v16f16, v16f16, v4f32]>;
+def VOP_V4F32_V16I16_V16I16_V4F32 : VOPProfile <[v4f32, v16i16, v16i16, v4f32]>;
+def VOP_V8F16_V16F16_V16F16_V8F16 : VOPProfile <[v8f16, v16f16, v16f16, v8f16]>;
+def VOP_V8I16_V16I16_V16I16_V8I16 : VOPProfile <[v8i16, v16i16, v16i16, v8i16]>;
 def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile <[v4i32, v4i32, v4i32, v4i32]>;
 def VOP_V4I32_V2I32_V2I32_V4I32 : VOPProfile <[v4i32, v2i32, v2i32, v4i32]>;
-def VOP_V4I32_V8I32_V8I32_V4I32 : VOPProfile <[v4i32, v8i32, v8i32, v4i32]>;
+
 class WMMAType <bits<2> val> {
   bit hasClamp = val{0};
@@ -812,22 +816,24 @@
   }
 }
+
 let WaveSizePredicate = isWave32 in {
-  defm V_WMMA_F32_16X16X16_F16   : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V8F32_V8F32_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F32_16X16X16_BF16  : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V8I32_V8I32_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F16_16X16X16_F16   : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V8F32_V8F32_V8F32_V8F32, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V8I32_V8I32_V8I32_V8I32, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
+  defm V_WMMA_F32_16X16X16_F16   : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
+  defm V_WMMA_F32_16X16X16_BF16  : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
+  defm V_WMMA_F16_16X16X16_F16   : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
+  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
   defm V_WMMA_I32_16X16X16_IU8   : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
   defm V_WMMA_I32_16X16X16_IU4   : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
 }
 
 let WaveSizePredicate = isWave64 in {
-  defm V_WMMA_F32_16X16X16_F16   : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V8F32_V8F32_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F32_16X16X16_BF16  : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V8I32_V8I32_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F16_16X16X16_F16   : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V4F32_V8F32_V8F32_V4F32, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V4I32_V8I32_V8I32_V4I32, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
+  defm V_WMMA_F32_16X16X16_F16   : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
+  defm V_WMMA_F32_16X16X16_BF16  : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
+  defm V_WMMA_F16_16X16X16_F16   : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
+  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
   defm V_WMMA_I32_16X16X16_IU8   : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
   defm V_WMMA_I32_16X16X16_IU4   : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
+
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
 
-declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>)
-declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>)
-declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg)
-declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
+declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
+declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f32_16x16x16_f16:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -20,14 +20,14 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
   store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f32_16x16x16_bf16:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -37,14 +37,14 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
   store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -54,12 +54,12 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
+  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
+  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -69,14 +69,14 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
+  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
+  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -86,12 +86,12 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
+  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -101,8 +101,8 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
+  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
 
-declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)
-declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)
-declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)
-declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f32_16x16x16_f16:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -18,14 +18,14 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
   store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f32_16x16x16_bf16:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -33,14 +33,14 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
   store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -48,12 +48,12 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
+  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -61,14 +61,14 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
+  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -76,12 +76,12 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
+  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -89,8 +89,8 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
+  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
 
-declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>)
-declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>)
-declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg)
-declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
+declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
+declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f32_16x16x16_f16:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -20,14 +20,14 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
   store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f32_16x16x16_bf16:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -37,14 +37,14 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
   store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -54,12 +54,12 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
+  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
+  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -69,14 +69,14 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
+  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
+  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -86,12 +86,12 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
+  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -101,8 +101,8 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
+  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
 
-declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)
-declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)
-declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)
-declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f32_16x16x16_f16:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -18,14 +18,14 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
   store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f32_16x16x16_bf16:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -33,14 +33,14 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
   store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -48,12 +48,12 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
+  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -61,14 +61,14 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
+  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -76,12 +76,12 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
+  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -89,8 +89,8 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
+  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
 
-declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>)
-declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>)
-declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg)
-declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
+declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
+declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
@@ -20,7 +20,7 @@
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
 ; W32-LABEL: test_wmma_f32_16x16x16_f16:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f32_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
@@ -34,8 +34,8 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
-  %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C)
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
+  %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x float> %C)
   store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
   store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
   ret void
@@ -43,7 +43,7 @@
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
 ; W32-LABEL: test_wmma_f32_16x16x16_bf16:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f32_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
@@ -57,8 +57,8 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
-  %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x float> %C)
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
+  %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x float> %C)
   store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
   store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
   ret void
@@ -66,7 +66,7 @@
 
 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %out2) {
 ; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
@@ -80,14 +80,14 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
-  %res2 = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C, i1 0)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
-  store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
+  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
+  %res2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <16 x half> %C, i1 0)
+  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
+  store <16 x half> %res2, <16 x half> addrspace(1)* %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %out2) {
 ; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -101,16 +101,16 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
-  %res2 = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C, i1 1)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
-  store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
+  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
+  %res2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <16 x half> %C, i1 1)
+  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
+  store <16 x half> %res2, <16 x half> addrspace(1)* %out2, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %out2) {
 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
@@ -124,14 +124,14 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
-  %res2 = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x i32> %C, i1 0)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
+  %res2 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <16 x i16> %C, i1 0)
+  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
+  store <16 x i16> %res2, <16 x i16> addrspace(1)* %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %out2) {
 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -145,10 +145,10 @@
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
-  %res2 = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x i32> %C, i1 1)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
+  %res2 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <16 x i16> %C, i1 1)
+  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
+  store <16 x i16> %res2, <16 x i16> addrspace(1)* %out2, align 32
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
 
-declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)
-declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)
-declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)
-declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
@@ -20,7 +20,7 @@
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
 ; W64-LABEL: test_wmma_f32_16x16x16_f16:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
@@ -30,8 +30,8 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
-  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C)
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
+  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %B, <16 x half> %B, <4 x float> %C)
   store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
   store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
   ret void
@@ -39,7 +39,7 @@
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
 ; W64-LABEL: test_wmma_f32_16x16x16_bf16:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
@@ -49,8 +49,8 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
-  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x float> %C)
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
+  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <4 x float> %C)
   store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
   store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
   ret void
@@ -58,7 +58,7 @@
 
 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %out2) {
 ; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
@@ -68,14 +68,14 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
-  %res2 = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C, i1 0)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
-  store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
+  %res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 0)
+  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
+  store <8 x half> %res2, <8 x half> addrspace(1)* %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %out2) {
 ; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -85,16 +85,16 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
-  %res2 = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C, i1 1)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
-  store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
+  %res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 1)
+  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
+  store <8 x half> %res2, <8 x half> addrspace(1)* %out2, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %out2) {
 ; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
@@ -104,14 +104,14 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
-  %res2 = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x i32> %C, i1 0)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
+  %res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 0)
+  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
+  store <8 x i16> %res2, <8 x i16> addrspace(1)* %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %out2) {
 ; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -121,10 +121,10 @@
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
-  %res2 = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x i32> %C, i1 1)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
+  %res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 1)
+  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
+  store <8 x i16> %res2, <8 x i16> addrspace(1)* %out2, align 16
   ret void
 }
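
Note, not part of the patch: for callers, the only IR-level change is the element type of the packed A/B operands (and, for the f16/bf16 variants, of C/D); each operand keeps its 256-bit register footprint per wave32 lane group. Below is a minimal wave32 sketch of porting IR written against the old signature, assuming the in-register packing convention is unchanged so that a plain bitcast reinterprets the same bits; the function @port_wmma_caller is hypothetical and exists only for illustration.

; Hypothetical wave32 caller (illustration only, not part of the patch).
; A/B data previously typed as <8 x float> is reinterpreted as <16 x half>;
; the bitcast changes only the IR type, not the underlying 256-bit value.
define amdgpu_ps <8 x float> @port_wmma_caller(<8 x float> %a.old, <8 x float> %b.old, <8 x float> %c) {
bb:
  %a = bitcast <8 x float> %a.old to <16 x half>
  %b = bitcast <8 x float> %b.old to <16 x half>
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %a, <16 x half> %b, <8 x float> %c)
  ret <8 x float> %res
}

declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <8 x float>)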