diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -305,5 +305,10 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_16x16x4f64, "V4dddV4dIiIiIi", "nc", "mai-insts") TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_4x4x4f64, "ddddIiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x32_i8, "V4iWiWiV4iIiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x16_i8, "V16iWiWiV16iIiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x8_xf32, "V4fV2fV2fV4fIiIiIi", "nc", "mai-insts") +TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4_xf32, "V16fV2fV2fV16fIiIiIi", "nc", "mai-insts") + #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl @@ -1,9 +1,11 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940 #pragma OPENCL EXTENSION cl_khr_fp64:enable +typedef float v2f __attribute__((ext_vector_type(2))); typedef float v4f __attribute__((ext_vector_type(4))); typedef float v16f __attribute__((ext_vector_type(16))); typedef float v32f __attribute__((ext_vector_type(32))); @@ -216,3 +218,33 @@ } #endif // MFMA_GFX90A_TESTS + +#ifdef MFMA_GFX940_TESTS +// CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8 +// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0) +void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c) +{ + *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0); +} + +// CHECK-GFX940-LABEL: @test_mfma_i32_32x32x16_i8 +// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0) +void test_mfma_i32_32x32x16_i8(global v16i* out, long a, long b, v16i c) +{ + *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0); +} + +// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x8_xf32 +// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0) +void test_mfma_f32_16x16x8_xf32(global v4f* out, v2f a, v2f b, v4f c) +{ + *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0); +} + +// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x4_xf32 +// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0) +void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, v2f b, v16f c) +{ + *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0); +} +#endif // MFMA_GFX940_TESTS diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl new file mode 100644 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx940 -verify -S -o - %s + +typedef float v2f __attribute__((ext_vector_type(2))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef float v16f __attribute__((ext_vector_type(16))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef int v16i __attribute__((ext_vector_type(16))); + +void test_mfma_i32_16x16x32i8(global v4i* out, long a, long b, v4i c, int d) +{ + *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x32_i8' must be a constant integer}} + *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x32_i8' must be a constant integer}} + *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x32_i8' must be a constant integer}} +} + +void test_mfma_i32_32x32x16i8(global v16i* out, long a, long b, v16i c, int d) +{ + *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x16_i8' must be a constant integer}} + *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x16_i8' must be a constant integer}} + *out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x16_i8' must be a constant integer}} +} + +void test_mfma_f32_16x16x8xf32(global v4f* out, v2f a, v2f b, v4f c, int d) +{ + *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x8_xf32' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x8_xf32' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x8_xf32' must be a constant integer}} +} + +void test_mfma_f32_32x32x4xf32(global v16f* out, v2f a, v2f b, v16f c, int d) +{ + *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4_xf32' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4_xf32' must be a constant integer}} + *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4_xf32' must be a constant integer}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1999,6 +1999,11 @@ [IntrArgMemOnly, IntrWillReturn, NoCapture>]>, GCCBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">; +def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic; +def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic; +def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic; +def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic; + //===----------------------------------------------------------------------===// // Special Intrinsics for backend internal use only. No frontend // should emit calls to these. diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4243,7 +4243,11 @@ case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: case Intrinsic::amdgcn_mfma_f64_16x16x4f64: - case Intrinsic::amdgcn_mfma_f64_4x4x4f64: { + case Intrinsic::amdgcn_mfma_f64_4x4x4f64: + case Intrinsic::amdgcn_mfma_i32_16x16x32_i8: + case Intrinsic::amdgcn_mfma_i32_32x32x16_i8: + case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32: + case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: { // Default for MAI intrinsics. // srcC can also be an immediate which can be folded later. // FIXME: Should we eventually add an alternative mapping with AGPR src diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -295,6 +295,10 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; // The dummy boolean output is divergent from the IR's perspective, // but the mask results are uniform. These produce a divergent and diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2407,6 +2407,11 @@ def VOP_V16F32_V4I16_V4I16_V16F32 : VOPProfile <[v16f32, v4i16, v4i16, v16f32]>; def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>; +def VOP_V4I32_I64_I64_V4I32 : VOPProfile <[v4i32, i64, i64, v4i32]>; +def VOP_V16I32_I64_I64_V16I32 : VOPProfile <[v16i32, i64, i64, v16i32]>; +def VOP_V4F32_V2F32_V2F32_V4F32 : VOPProfile <[v4f32, v2f32, v2f32, v4f32]>; +def VOP_V16F32_V2F32_V2F32_V16F32 : VOPProfile <[v16f32, v2f32, v2f32, v16f32]>; + class Commutable_REV { string RevOp = revOp; bit IsOrig = isOrig; diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -264,10 +264,14 @@ def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>; def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>; +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X8X")>; def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X16")>; +def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X32")>; def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X[14][FBI]")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X4XF")>; def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X8")>; +def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X16")>; def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X[124][FBI]")>; def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -412,6 +412,10 @@ def VOPProfileMAI_F32_V4I16_X32 : VOPProfileMAI; def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI; def VOPProfileMAI_F64_4X4X4F64 : VOPProfileMAI; +def VOPProfileMAI_I32_I64_X16 : VOPProfileMAI; +def VOPProfileMAI_I32_I64_X32 : VOPProfileMAI; +def VOPProfileMAI_F32_V2F32_X16 : VOPProfileMAI; +def VOPProfileMAI_F32_V2F32_X32 : VOPProfileMAI; def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI; def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI; @@ -430,6 +434,10 @@ def VOPProfileMAI_F32_V4I16_X32_VCD : VOPProfileMAI; def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI; def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI; +def VOPProfileMAI_I32_I64_X16_VCD : VOPProfileMAI; +def VOPProfileMAI_I32_I64_X32_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_V2F32_X16_VCD : VOPProfileMAI; +def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI; class MFMATable { bit IsMac = is_mac; @@ -527,6 +535,13 @@ defm V_MFMA_F64_4X4X4F64 : MAIInst<"v_mfma_f64_4x4x4f64", "F64_4X4X4F64", int_amdgcn_mfma_f64_4x4x4f64>; } // End Predicates = [isGFX90APlus] +let Predicates = [isGFX940Plus] in { + defm V_MFMA_I32_32X32X16I8 : MAIInst<"v_mfma_i32_32x32x16i8", "I32_I64_X32", int_amdgcn_mfma_i32_32x32x16_i8>; + defm V_MFMA_I32_16X16X32I8 : MAIInst<"v_mfma_i32_16x16x32i8", "I32_I64_X16", int_amdgcn_mfma_i32_16x16x32_i8>; + defm V_MFMA_F32_16X16X8XF32 : MAIInst<"v_mfma_f32_16x16x8xf32", "F32_V2F32_X16", int_amdgcn_mfma_f32_16x16x8_xf32>; + defm V_MFMA_F32_32X32X4XF32 : MAIInst<"v_mfma_f32_32x32x4xf32", "F32_V2F32_X32", int_amdgcn_mfma_f32_32x32x4_xf32>; +} // End Predicates = [isGFX940Plus] + let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in { defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile, any_fma>; defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile, any_fmul>; @@ -727,6 +742,11 @@ defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx90a <0x6e>; defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>; +defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">; +defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">; +defm V_MFMA_F32_16X16X8XF32 : VOP3P_Real_MFMA_gfx940 <0x3e, "v_mfma_f32_16x16x8_xf32">; +defm V_MFMA_F32_32X32X4XF32 : VOP3P_Real_MFMA_gfx940 <0x3f, "v_mfma_f32_32x32x4_xf32">; + defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5d, "v_mfma_f32_32x32x4_2b_bf16">; defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5e, "v_mfma_f32_16x16x4_4b_bf16">; defm V_MFMA_F32_4X4X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5f, "v_mfma_f32_4x4x4_16b_bf16">; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx940.mir @@ -0,0 +1,119 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx940 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST +# RUN: llc -march=amdgcn -mcpu=gfx940 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY + +--- +name: mfma_i32_16x16x32_i8_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + + ; FAST-LABEL: name: mfma_i32_16x16x32_i8_vva + ; FAST: liveins: $vgpr1_vgpr2, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; FAST: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; FAST: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.16x16x32.i8), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0 + ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + ; GREEDY-LABEL: name: mfma_i32_16x16x32_i8_vva + ; GREEDY: liveins: $vgpr1_vgpr2, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; GREEDY: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.16x16x32.i8), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0 + ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.16x16x32.i8), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: mfma_i32_32x32x16_i8_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + + ; FAST-LABEL: name: mfma_i32_32x32x16_i8_vva + ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; FAST: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FAST: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.32x32x16.i8), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<16 x s32>), 0, 0, 0 + ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + ; GREEDY-LABEL: name: mfma_i32_32x32x16_i8_vva + ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GREEDY: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.32x32x16.i8), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<16 x s32>), 0, 0, 0 + ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.32x32x16.i8), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... + +--- +name: mfma_f32_16x16x8_xf32_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr1_vgpr2, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + + ; FAST-LABEL: name: mfma_f32_16x16x8_xf32_vva + ; FAST: liveins: $vgpr1_vgpr2, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; FAST: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; FAST: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x8.xf32), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0 + ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + ; GREEDY-LABEL: name: mfma_f32_16x16x8_xf32_vva + ; GREEDY: liveins: $vgpr1_vgpr2, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; GREEDY: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x8.xf32), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<4 x s32>), 0, 0, 0 + ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x8.xf32), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: mfma_f32_32x32x4_xf32_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + + ; FAST-LABEL: name: mfma_f32_32x32x4_xf32_vva + ; FAST: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FAST: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; FAST: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; FAST: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; FAST: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4.xf32), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<16 x s32>), 0, 0, 0 + ; FAST: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + ; GREEDY-LABEL: name: mfma_f32_32x32x4_xf32_vva + ; GREEDY: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GREEDY: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GREEDY: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr2_vgpr3 + ; GREEDY: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GREEDY: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4.xf32), [[COPY]](s64), [[COPY1]](s64), [[COPY2]](<16 x s32>), 0, 0, 0 + ; GREEDY: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4.xf32), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx940.ll @@ -0,0 +1,83 @@ +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940 %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -stress-regalloc=10 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX940 %s +; RUN: llc -march=amdgcn -mcpu=gfx940 -stress-regalloc=10 -global-isel -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s + +declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32) +declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32) + +; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8: +; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1 +; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2 +; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3 +; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4 +; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} +; GFX940: v_mfma_i32_16x16x32_i8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GISEL: v_mfma_i32_16x16x32_i8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-NOT: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] +define amdgpu_kernel void @test_mfma_i32_16x16x32i8(<4 x i32> addrspace(1)* %arg) #0 { +bb: + %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg + %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 1, i32 2, i32 3) + store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_i32_32x32x16i8: +; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1 +; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2 +; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 3 +; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4 +; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} +; GFX940: v_mfma_i32_32x32x16_i8 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[TWO]]:[[ONE]]], v{{\[}}[[FOUR]]:[[THREE]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GISEL: v_mfma_i32_32x32x16_i8 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-NOT: v_accvgpr_read_b32 +; GCN-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] +define amdgpu_kernel void @test_mfma_i32_32x32x16i8(<16 x i32> addrspace(1)* %arg) #0 { +bb: + %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg + %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 4294967298, i64 12884901892, <16 x i32> %in.1, i32 1, i32 2, i32 3) + store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x8xf32: +; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0 +; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0 +; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000 +; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0 +; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} +; GFX940: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:[[TWO]]], v{{\[}}[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GISEL: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-NOT: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] +define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(<4 x float> addrspace(1)* %arg) #0 { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> , <2 x float> , <4 x float> %in.1, i32 1, i32 2, i32 3) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x4xf32: +; GFX940-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0 +; GFX940-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0 +; GFX940-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000 +; GFX940-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0 +; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} +; GFX940: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:[[TWO]]], v{{\[}}[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GISEL: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 +; GCN-NOT: v_accvgpr_read_b32 +; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] +define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(<16 x float> addrspace(1)* %arg) #0 { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> , <2 x float> , <16 x float> %in.1, i32 1, i32 2, i32 3) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} + +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mfma-vgpr-cd-select-gfx940.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s + +declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32) +declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32) +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32) +declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32) + +; GCN-LABEL: {{^}}test_mfma_i32_16x16x32i8: +; GCN: v_mfma_i32_16x16x32_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +define amdgpu_kernel void @test_mfma_i32_16x16x32i8(<4 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %arg + %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 0, i32 0, i32 0) + store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_i32_32x32x16i8: +; GCN: v_mfma_i32_32x32x16_i8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +define amdgpu_kernel void @test_mfma_i32_32x32x16i8(<16 x i32> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x i32>, <16 x i32> addrspace(1)* %arg + %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 4294967298, i64 12884901892, <16 x i32> %in.1, i32 0, i32 0, i32 0) + store <16 x i32> %mai.1, <16 x i32> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_16x16x8xf32: +; GCN: v_mfma_f32_16x16x8_xf32 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(<4 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg + %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> , <2 x float> , <4 x float> %in.1, i32 0, i32 0, i32 0) + store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}test_mfma_f32_32x32x4xf32: +; GCN: v_mfma_f32_32x32x4_xf32 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(<16 x float> addrspace(1)* %arg) { +bb: + %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg + %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> , <2 x float> , <16 x float> %in.1, i32 0, i32 0, i32 0) + store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg + ret void +} diff --git a/llvm/test/MC/AMDGPU/mai-gfx940.s b/llvm/test/MC/AMDGPU/mai-gfx940.s --- a/llvm/test/MC/AMDGPU/mai-gfx940.s +++ b/llvm/test/MC/AMDGPU/mai-gfx940.s @@ -262,6 +262,54 @@ v_mfma_f32_32x32x1f32 v[0:31], v0, v1, v[34:65] blgp:7 // GFX940: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, v[34:65] blgp:7 ; encoding: [0x00,0x00,0xc0,0xd3,0x00,0x03,0x8a,0xe4] +v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15] +// GFX940: v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15] ; encoding: [0x00,0x00,0xd6,0xd3,0x02,0x09,0x02,0x04] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] +// GFX940: v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] ; encoding: [0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0x04] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15] +// GFX940: v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15] ; encoding: [0x00,0x00,0xd6,0xd3,0x02,0x09,0x02,0x04] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] +// GFX940: v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] ; encoding: [0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0x04] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 +// GFX940: v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 ; encoding: [0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0xa4] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_i32_32x32x16i8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 +// GFX940: v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 ; encoding: [0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0xa4] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_i32_32x32x16i8 v[0:15], v[2:3], v[4:5], v[0:15] blgp:5 +// GFX940: v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15] blgp:5 ; encoding: [0x00,0x00,0xd6,0xd3,0x02,0x09,0x02,0xa4] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_i32_16x16x32_i8 v[0:3], v[2:3], v[4:5], v[0:3] +// GFX940: v_mfma_i32_16x16x32_i8 v[0:3], v[2:3], v[4:5], v[0:3] ; encoding: [0x00,0x00,0xd7,0xd3,0x02,0x09,0x02,0x04] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] +// GFX940: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] ; encoding: [0x00,0x80,0xd7,0xd3,0x02,0x09,0x02,0x04] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 +// GFX940: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 ; encoding: [0x00,0x80,0xd7,0xd3,0x02,0x09,0x02,0xa4] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_i32_16x16x32i8 v[0:3], v[2:3], v[4:5], v[0:3] blgp:5 +// GFX940: v_mfma_i32_16x16x32_i8 v[0:3], v[2:3], v[4:5], v[0:3] blgp:5 ; encoding: [0x00,0x00,0xd7,0xd3,0x02,0x09,0x02,0xa4] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_i32_16x16x32i8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 +// GFX940: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 ; encoding: [0x00,0x80,0xd7,0xd3,0x02,0x09,0x02,0xa4] +// GFX90A: error: instruction not supported on this GPU + v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[2:3], v[4:5], v[34:65] // GFX940: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[2:3], v[4:5], v[34:65] ; encoding: [0x00,0x00,0xdd,0xd3,0x02,0x09,0x8a,0x04] // GFX90A: error: instruction not supported on this GPU @@ -379,3 +427,35 @@ v_mfma_f32_16x16x16bf16_1k a[0:3], v[2:3], v[4:5], a[2:5] // GFX940: v_mfma_f32_16x16x16_bf16 a[0:3], v[2:3], v[4:5], a[2:5] ; encoding: [0x00,0x80,0xe1,0xd3,0x02,0x09,0x0a,0x04] + +v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5] +// GFX940: v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5] ; encoding: [0x00,0x80,0xbe,0xd3,0x02,0x09,0x0a,0x04] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8_xf32 v[0:3], v[2:3], v[4:5], v[2:5] +// GFX940: v_mfma_f32_16x16x8_xf32 v[0:3], v[2:3], v[4:5], v[2:5] ; encoding: [0x00,0x00,0xbe,0xd3,0x02,0x09,0x0a,0x04] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8xf32 a[0:3], v[2:3], v[4:5], a[2:5] +// GFX940: v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5] ; encoding: [0x00,0x80,0xbe,0xd3,0x02,0x09,0x0a,0x04] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_f32_16x16x8xf32 v[0:3], v[2:3], v[4:5], v[2:5] +// GFX940: v_mfma_f32_16x16x8_xf32 v[0:3], v[2:3], v[4:5], v[2:5] ; encoding: [0x00,0x00,0xbe,0xd3,0x02,0x09,0x0a,0x04] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 v[0:15], v[2:3], v[4:5], v[18:33] +// GFX940: v_mfma_f32_32x32x4_xf32 v[0:15], v[2:3], v[4:5], v[18:33] ; encoding: [0x00,0x00,0xbf,0xd3,0x02,0x09,0x4a,0x04] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[18:33] +// GFX940: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[18:33] ; encoding: [0x00,0x80,0xbf,0xd3,0x02,0x09,0x4a,0x04] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4xf32 v[0:15], v[2:3], v[4:5], v[18:33] +// GFX940: v_mfma_f32_32x32x4_xf32 v[0:15], v[2:3], v[4:5], v[18:33] ; encoding: [0x00,0x00,0xbf,0xd3,0x02,0x09,0x4a,0x04] +// GFX90A: error: instruction not supported on this GPU + +v_mfma_f32_32x32x4xf32 a[0:15], v[2:3], v[4:5], a[18:33] +// GFX940: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[18:33] ; encoding: [0x00,0x80,0xbf,0xd3,0x02,0x09,0x4a,0x04] +// GFX90A: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt b/llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt --- a/llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/mai-gfx940.txt @@ -3,6 +3,24 @@ # GFX940: v_accvgpr_write_b32 a10, s20 ; encoding: [0x0a,0x40,0xd9,0xd3,0x14,0x00,0x00,0x18] 0x0a,0x40,0xd9,0xd3,0x14,0x00,0x00,0x18 +# GFX940: v_mfma_i32_32x32x16_i8 v[0:15], v[2:3], v[4:5], v[0:15] ; encoding: [0x00,0x00,0xd6,0xd3,0x02,0x09,0x02,0x04] +0x00,0x00,0xd6,0xd3,0x02,0x09,0x02,0x04 + +# GFX940: v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] ; encoding: [0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0x04] +0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0x04 + +# GFX940: v_mfma_i32_32x32x16_i8 a[0:15], v[2:3], v[4:5], a[0:15] blgp:5 ; encoding: [0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0xa4] +0x00,0x80,0xd6,0xd3,0x02,0x09,0x02,0xa4 + +# GFX940: v_mfma_i32_16x16x32_i8 v[0:3], v[2:3], v[4:5], v[0:3] ; encoding: [0x00,0x00,0xd7,0xd3,0x02,0x09,0x02,0x04] +0x00,0x00,0xd7,0xd3,0x02,0x09,0x02,0x04 + +# GFX940: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] ; encoding: [0x00,0x80,0xd7,0xd3,0x02,0x09,0x02,0x04] +0x00,0x80,0xd7,0xd3,0x02,0x09,0x02,0x04 + +# GFX940: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] blgp:5 ; encoding: [0x00,0x80,0xd7,0xd3,0x02,0x09,0x02,0xa4] +0x00,0x80,0xd7,0xd3,0x02,0x09,0x02,0xa4 + # GFX940: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[2:3], v[4:5], v[2:33] ; encoding: [0x00,0x00,0xdd,0xd3,0x02,0x09,0x0a,0x04] 0x00,0x00,0xdd,0xd3,0x02,0x09,0x0a,0x04 @@ -32,3 +50,15 @@ # GFX940: v_mfma_f32_16x16x16_bf16 a[0:3], v[2:3], v[4:5], a[2:5] ; encoding: [0x00,0x80,0xe1,0xd3,0x02,0x09,0x0a,0x04] 0x00,0x80,0xe1,0xd3,0x02,0x09,0x0a,0x04 + +# GFX940: v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5] ; encoding: [0x00,0x80,0xbe,0xd3,0x02,0x09,0x0a,0x04] +0x00,0x80,0xbe,0xd3,0x02,0x09,0x0a,0x04 + +# GFX940: v_mfma_f32_16x16x8_xf32 v[0:3], v[2:3], v[4:5], v[2:5] ; encoding: [0x00,0x00,0xbe,0xd3,0x02,0x09,0x0a,0x04] +0x00,0x00,0xbe,0xd3,0x02,0x09,0x0a,0x04 + +# GFX940: v_mfma_f32_32x32x4_xf32 v[0:15], v[2:3], v[4:5], v[2:17] ; encoding: [0x00,0x00,0xbf,0xd3,0x02,0x09,0x0a,0x04] +0x00,0x00,0xbf,0xd3,0x02,0x09,0x0a,0x04 + +# GFX940: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[4:5], a[2:17] ; encoding: [0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04] +0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04