Index: clang/docs/ReleaseNotes.rst =================================================================== --- clang/docs/ReleaseNotes.rst +++ clang/docs/ReleaseNotes.rst @@ -317,6 +317,9 @@ X86 Support ^^^^^^^^^^^ +- Add ISA of ``AMX-COMPLEX`` which support ``_tile_cmmimfp16ps`` and + ``_tile_cmmrlfp16ps``. + Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ Index: clang/include/clang/Basic/BuiltinsX86_64.def =================================================================== --- clang/include/clang/Basic/BuiltinsX86_64.def +++ clang/include/clang/Basic/BuiltinsX86_64.def @@ -117,6 +117,8 @@ TARGET_BUILTIN(__builtin_ia32_tilezero_internal, "V256iUsUs", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tdpbf16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-bf16") TARGET_BUILTIN(__builtin_ia32_tdpfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-fp16") +TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") +TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-complex") // AMX TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile") @@ -134,6 +136,9 @@ TARGET_BUILTIN(__builtin_ia32_tdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16") TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite") +TARGET_BUILTIN(__builtin_ia32_tcmmimfp16ps, "vIUcIUcIUc", "n", "amx-complex") +TARGET_BUILTIN(__builtin_ia32_tcmmrlfp16ps, "vIUcIUcIUc", "n", "amx-complex") + TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi") TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd") TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd") Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -4656,6 +4656,8 @@ def mno_3dnowa : Flag<["-"], "mno-3dnowa">, Group; def mamx_bf16 : Flag<["-"], "mamx-bf16">, Group; def mno_amx_bf16 : Flag<["-"], "mno-amx-bf16">, Group; +def mamx_complex : Flag<["-"], "mamx-complex">, Group; +def mno_amx_complex : Flag<["-"], "mno-amx-complex">, Group; def mamx_fp16 : Flag<["-"], "mamx-fp16">, Group; def mno_amx_fp16 : Flag<["-"], "mno-amx-fp16">, Group; def mamx_int8 : Flag<["-"], "mamx-int8">, Group; Index: clang/lib/Basic/Targets/X86.h =================================================================== --- clang/lib/Basic/Targets/X86.h +++ clang/lib/Basic/Targets/X86.h @@ -154,6 +154,7 @@ bool HasAMXTILE = false; bool HasAMXINT8 = false; bool HasAMXBF16 = false; + bool HasAMXCOMPLEX = false; bool HasSERIALIZE = false; bool HasTSXLDTRK = false; bool HasUINTR = false; Index: clang/lib/Basic/Targets/X86.cpp =================================================================== --- clang/lib/Basic/Targets/X86.cpp +++ clang/lib/Basic/Targets/X86.cpp @@ -335,6 +335,8 @@ HasAMXINT8 = true; } else if (Feature == "+amx-tile") { HasAMXTILE = true; + } else if (Feature == "+amx-complex") { + HasAMXCOMPLEX = true; } else if (Feature == "+cmpccxadd") { HasCMPCCXADD = true; } else if (Feature == "+raoint") { @@ -799,6 +801,8 @@ Builder.defineMacro("__AMX_BF16__"); if (HasAMXFP16) Builder.defineMacro("__AMX_FP16__"); + if (HasAMXCOMPLEX) + Builder.defineMacro("__AMXCOMPLEX__"); if (HasCMPCCXADD) Builder.defineMacro("__CMPCCXADD__"); if (HasRAOINT) @@ -915,6 +919,7 @@ .Case("amx-fp16", true) .Case("amx-int8", true) .Case("amx-tile", true) + .Case("amx-complex", true) .Case("avx", true) .Case("avx2", true) .Case("avx512f", true) @@ -1016,6 +1021,7 @@ .Case("amx-fp16", HasAMXFP16) .Case("amx-int8", HasAMXINT8) .Case("amx-tile", HasAMXTILE) + .Case("amx-complex", HasAMXCOMPLEX) .Case("avx", SSELevel >= AVX) .Case("avx2", SSELevel >= AVX2) .Case("avx512f", SSELevel >= AVX512F) Index: clang/lib/Headers/CMakeLists.txt =================================================================== --- clang/lib/Headers/CMakeLists.txt +++ clang/lib/Headers/CMakeLists.txt @@ -117,6 +117,7 @@ # Intrinsics adxintrin.h ammintrin.h + amxcomplexintrin.h amxfp16intrin.h amxintrin.h avx2intrin.h Index: clang/lib/Headers/amxcomplexintrin.h =================================================================== --- /dev/null +++ clang/lib/Headers/amxcomplexintrin.h @@ -0,0 +1,201 @@ +/*===------------- amxcomplexintrin.h - AMXCOMPLEX --------------------------=== + */ +/* INTEL_CUSTOMIZATION */ +/* + * INTEL CONFIDENTIAL + * + * Modifications, Copyright (C) 2023 Intel Corporation + * + * This software and the related documents are Intel copyrighted materials, and + * your use of them is governed by the express license under which they were + * provided to you ("License"). Unless the License provides otherwise, you may + * not use, modify, copy, publish, distribute, disclose or transmit this + * software or the related documents without Intel's prior written permission. + * + * This software and the related documents are provided as is, with no express + * or implied warranties, other than those that are expressly stated in the + * License. + */ +/* end INTEL_CUSTOMIZATION */ +/* + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif // __IMMINTRIN_H + +#ifndef __AMX_COMPLEXINTRIN_H +#define __AMX_COMPLEXINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_COMPLEX \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-complex"))) + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles \a a and \a b is interpreted as a complex number +/// with FP16 real part and FP16 imaginary part. +/// Calculates the imaginary part of the result. For each possible combination +/// of (row of \a a, column of \a b), it performs a set of multiplication +/// and accumulations on all corresponding complex numbers (one from \a a +/// and one from \a b). The imaginary part of the \a a element is multiplied +/// with the real part of the corresponding \a b element, and the real part +/// of the \a a element is multiplied with the imaginary part of the +/// corresponding \a b elements. The two accumulated results are added, and +/// then accumulated into the corresponding row and column of \a dst. +/// +/// \headerfile +/// +/// \code +/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b); +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// tmp := dst.row[m] +/// FOR k := 0 TO (a.colsb / 4) - 1 +/// FOR n := 0 TO (dst.colsb / 4) - 1 +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1]) +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0]) +/// ENDFOR +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_cmmimfp16ps(dst, a, b) __builtin_ia32_tcmmimfp16ps(dst, a, b) + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles \a a and \a b is interpreted as a complex number +/// with FP16 real part and FP16 imaginary part. +/// Calculates the real part of the result. For each possible combination +/// of (row of \a a, column of \a b), it performs a set of multiplication +/// and accumulations on all corresponding complex numbers (one from \a a +/// and one from \a b). The real part of the \a a element is multiplied +/// with the real part of the corresponding \a b element, and the negated +/// imaginary part of the \a a element is multiplied with the imaginary +/// part of the corresponding \a b elements. The two accumulated results +/// are added, and then accumulated into the corresponding row and column +/// of \a dst. +/// +/// \headerfile +/// +/// \code +/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b); +/// \endcode +/// +/// \code{.operation} +/// FOR m := 0 TO dst.rows - 1 +/// tmp := dst.row[m] +/// FOR k := 0 TO (a.colsb / 4) - 1 +/// FOR n := 0 TO (dst.colsb / 4) - 1 +/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0]) +/// tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1]) +/// ENDFOR +/// ENDFOR +/// write_row_and_zero(dst, m, tmp, dst.colsb) +/// ENDFOR +/// zero_upper_rows(dst, dst.rows) +/// zero_tileconfig_start() +/// \endcode +/// +/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_cmmrlfp16ps(dst, a, b) __builtin_ia32_tcmmrlfp16ps(dst, a, b) + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX +_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2); +} + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX +_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2); +} + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles src0 and src1 is interpreted as a complex number with +/// FP16 real part and FP16 imaginary part. +/// This function calculates the imaginary part of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TCMMIMFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_COMPLEX +static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col, + dst->tile, src0.tile, src1.tile); +} + +/// Perform matrix multiplication of two tiles containing complex elements and +/// accumulate the results into a packed single precision tile. Each dword +/// element in input tiles src0 and src1 is interpreted as a complex number with +/// FP16 real part and FP16 imaginary part. +/// This function calculates the real part of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TCMMRLFP16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_COMPLEX +static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col, + dst->tile, src0.tile, src1.tile); +} + +#endif // __x86_64__ +#endif // __AMX_COMPLEXINTRIN_H Index: clang/lib/Headers/immintrin.h =================================================================== --- clang/lib/Headers/immintrin.h +++ clang/lib/Headers/immintrin.h @@ -538,6 +538,11 @@ #include #endif +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AMXCOMPLEX__) +#include +#endif + #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ defined(__AVX512VP2INTERSECT__) #include Index: clang/lib/Sema/SemaChecking.cpp =================================================================== --- clang/lib/Sema/SemaChecking.cpp +++ clang/lib/Sema/SemaChecking.cpp @@ -5237,6 +5237,8 @@ case X86::BI__builtin_ia32_tdpbuud: case X86::BI__builtin_ia32_tdpbf16ps: case X86::BI__builtin_ia32_tdpfp16ps: + case X86::BI__builtin_ia32_tcmmimfp16ps: + case X86::BI__builtin_ia32_tcmmrlfp16ps: return CheckX86BuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2}); } } Index: clang/test/CodeGen/X86/amx_complex_api.c =================================================================== --- /dev/null +++ clang/test/CodeGen/X86/amx_complex_api.c @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f -target-feature +amx-bf16 \ +// RUN: -target-feature +amx-complex \ +// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK + +#include + +char buf[1024]; +#define STRIDE 32 + +char buf2[1024]; + +void test_tile_cmmimfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) { + //CHECK-LABEL: @test_tile_cmmimfp16ps + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call x86_amx @llvm.x86.tcmmimfp16ps.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + __tile_cmmimfp16ps(&c, a, b); +} + +void test_tile_cmmrlfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) { + //CHECK-LABEL: @test_tile_cmmrlfp16ps + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call x86_amx @llvm.x86.tcmmrlfp16ps.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + __tile_cmmrlfp16ps(&c, a, b); +} Index: clang/test/CodeGen/X86/amxcomplex-builtins.c =================================================================== --- /dev/null +++ clang/test/CodeGen/X86/amxcomplex-builtins.c @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-complex \ +// RUN: -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s + +#include +#include +void test_tile_cmmimfp16ps(void) { + // CHECK-LABEL: @test_tile_cmmimfp16ps + // CHECK: call void @llvm.x86.tcmmimfp16ps(i8 1, i8 2, i8 3) + _tile_cmmimfp16ps(1, 2, 3); +} + +void test_tile_cmmrlfp16ps(void) { + // CHECK-LABEL: @test_tile_cmmrlfp16ps + // CHECK: call void @llvm.x86.tcmmrlfp16ps(i8 1, i8 2, i8 3) + _tile_cmmrlfp16ps(1, 2, 3); +} Index: clang/test/CodeGen/X86/amxcomplex-errors.c =================================================================== --- /dev/null +++ clang/test/CodeGen/X86/amxcomplex-errors.c @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-complex -emit-llvm -fsyntax-only -verify + +#include +#include +void test_tile_cmmimfp16ps() { + _tile_cmmimfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}} + _tile_cmmimfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}} + _tile_cmmimfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}} + _tile_cmmimfp16ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}} +} + +void test_tile_cmmrlfp16ps() { + _tile_cmmrlfp16ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}} + _tile_cmmrlfp16ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}} + _tile_cmmrlfp16ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}} + _tile_cmmrlfp16ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}} +} Index: clang/test/Driver/x86-target-features.c =================================================================== --- clang/test/Driver/x86-target-features.c +++ clang/test/Driver/x86-target-features.c @@ -297,6 +297,13 @@ // AMX-FP16: "-target-feature" "+amx-fp16" // NO-AMX-FP16: "-target-feature" "-amx-fp16" +// RUN: %clang -target x86_64-unknown-linux-gnu -mamx-complex %s \ +// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-COMPLEX %s +// RUN: %clang -target x86_64-unknown-linux-gnu -mno-amx-complex %s \ +// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-COMPLEX %s +// AMX-COMPLEX: "-target-feature" "+amx-complex" +// NO-AMX-COMPLEX: "-target-feature" "-amx-complex" + // RUN: %clang --target=i386 -march=i386 -mhreset %s -### 2>&1 | FileCheck -check-prefix=HRESET %s // RUN: %clang --target=i386 -march=i386 -mno-hreset %s -### 2>&1 | FileCheck -check-prefix=NO-HRESET %s // HRESET: "-target-feature" "+hreset" Index: clang/test/Preprocessor/x86_target_features.c =================================================================== --- clang/test/Preprocessor/x86_target_features.c +++ clang/test/Preprocessor/x86_target_features.c @@ -559,6 +559,18 @@ // NO-AMX-FP16-NOT: #define __AMX_FP16__ 1 // NO-AMX-FP16-NOT: #define __AMX_TILE__ 1 +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-complex -x c \ +// RUN: -E -dM -o - %s | FileCheck -check-prefix=AMX-COMPLEX %s + +// AMX-COMPLEX: #define __AMXCOMPLEX__ 1 + +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-amx-complex -x c \ +// RUN: -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-COMPLEX %s +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-complex -mno-amx-tile \ +// RUN: -x c -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-COMPLEX %s + +// NO-AMX-COMPLEX-NOT: #define __AMXCOMPLEX__ 1 + // RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNI %s // AVXVNNI: #define __AVX2__ 1 Index: llvm/include/llvm/IR/IntrinsicsX86.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsX86.td +++ llvm/include/llvm/IR/IntrinsicsX86.td @@ -5352,6 +5352,14 @@ Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], [ImmArg>, ImmArg>, ImmArg>]>; + // AMX-COMPLEX + def int_x86_tcmmimfp16ps : ClangBuiltin<"__builtin_ia32_tcmmimfp16ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], + [ImmArg>, ImmArg>, ImmArg>]>; + def int_x86_tcmmrlfp16ps : ClangBuiltin<"__builtin_ia32_tcmmrlfp16ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], + [ImmArg>, ImmArg>, ImmArg>]>; + // AMX - internal intrinsics def int_x86_ldtilecfg_internal : ClangBuiltin<"__builtin_ia32_tile_loadconfig_internal">, @@ -5414,6 +5422,19 @@ DefaultAttrsIntrinsic<[llvm_x86amx_ty], [llvm_anyvector_ty], [IntrNoMem]>; def int_x86_cast_tile_to_vector: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_x86amx_ty], [IntrNoMem]>; + + def int_x86_tcmmimfp16ps_internal : + ClangBuiltin<"__builtin_ia32_tcmmimfp16ps_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, + llvm_x86amx_ty, llvm_x86amx_ty, + llvm_x86amx_ty], []>; + def int_x86_tcmmrlfp16ps_internal : + ClangBuiltin<"__builtin_ia32_tcmmrlfp16ps_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, + llvm_x86amx_ty, llvm_x86amx_ty, + llvm_x86amx_ty], []>; } //===----------------------------------------------------------------------===// Index: llvm/include/llvm/TargetParser/X86TargetParser.def =================================================================== --- llvm/include/llvm/TargetParser/X86TargetParser.def +++ llvm/include/llvm/TargetParser/X86TargetParser.def @@ -167,6 +167,7 @@ X86_FEATURE (64BIT, "64bit") X86_FEATURE (ADX, "adx") X86_FEATURE (AMX_BF16, "amx-bf16") +X86_FEATURE (AMX_COMPLEX, "amx-complex") X86_FEATURE (AMX_INT8, "amx-int8") X86_FEATURE (AMX_TILE, "amx-tile") X86_FEATURE (CLDEMOTE, "cldemote") Index: llvm/lib/Target/X86/X86.td =================================================================== --- llvm/lib/Target/X86/X86.td +++ llvm/lib/Target/X86/X86.td @@ -267,6 +267,9 @@ def FeatureAMXFP16 : SubtargetFeature<"amx-fp16", "HasAMXFP16", "true", "Support AMX amx-fp16 instructions", [FeatureAMXTILE]>; +def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true", + "Support AMX-COMPLEX instructions", + [FeatureAMXTILE]>; def FeatureCMPCCXADD : SubtargetFeature<"cmpccxadd", "HasCMPCCXADD", "true", "Support CMPCCXADD instructions">; def FeatureRAOINT : SubtargetFeature<"raoint", "HasRAOINT", "true", Index: llvm/lib/Target/X86/X86ExpandPseudo.cpp =================================================================== --- llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -562,6 +562,8 @@ MI.setDesc(TII->get(Opc)); return true; } + case X86::PTCMMIMFP16PSV: + case X86::PTCMMRLFP16PSV: case X86::PTDPBSSDV: case X86::PTDPBSUDV: case X86::PTDPBUSDV: @@ -573,6 +575,8 @@ MI.removeOperand(i); unsigned Opc; switch (Opcode) { + case X86::PTCMMIMFP16PSV: Opc = X86::TCMMIMFP16PS; break; + case X86::PTCMMRLFP16PSV: Opc = X86::TCMMRLFP16PS; break; case X86::PTDPBSSDV: Opc = X86::TDPBSSD; break; case X86::PTDPBSUDV: Opc = X86::TDPBSUD; break; case X86::PTDPBUSDV: Opc = X86::TDPBUSD; break; Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -38164,6 +38164,23 @@ MI.eraseFromParent(); // The pseudo is gone now. return BB; } + case X86::PTCMMIMFP16PS: + case X86::PTCMMRLFP16PS: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch (MI.getOpcode()) { + default: llvm_unreachable("Unexpected instruction!"); + case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break; + case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break; + } + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef); + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } } } Index: llvm/lib/Target/X86/X86InstrAMX.td =================================================================== --- llvm/lib/Target/X86/X86InstrAMX.td +++ llvm/lib/Target/X86/X86InstrAMX.td @@ -215,3 +215,45 @@ } } } // HasAMXTILE, HasAMXFP16 + +let Predicates = [HasAMXCOMPLEX, In64BitMode] in { + let SchedRW = [WriteSystem] in { + let Constraints = "$src1 = $dst" in { + def TCMMIMFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), + (ins TILE:$src1, TILE:$src2, TILE:$src3), + "tcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", + []>, T8PD, VEX_4V; + def TCMMRLFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), + (ins TILE:$src1, TILE:$src2, TILE:$src3), + "tcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", + []>, VEX_4V, VEX_WIG, T8PS; + + } // Constraints = "$src1 = $dst" + + let Constraints = "$src4 = $dst" in { + def PTCMMIMFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, + GR16:$src2, GR16:$src3, TILE:$src4, + TILE:$src5, TILE:$src6), + [(set TILE: $dst, + (int_x86_tcmmimfp16ps_internal GR16:$src1, GR16:$src2, + GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; + def PTCMMRLFP16PSV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, + GR16:$src2, GR16:$src3, TILE:$src4, + TILE:$src5, TILE:$src6), + [(set TILE: $dst, + (int_x86_tcmmrlfp16ps_internal GR16:$src1, GR16:$src2, + GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6))]>; + } + + let usesCustomInserter = 1 in { + def PTCMMIMFP16PS : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tcmmimfp16ps timm:$src1, + timm:$src2, timm:$src3)]>; + def PTCMMRLFP16PS : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tcmmrlfp16ps timm:$src1, + timm:$src2, timm:$src3)]>; + } + } // SchedRW = [WriteSystem] +} Index: llvm/lib/Target/X86/X86InstrInfo.td =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.td +++ llvm/lib/Target/X86/X86InstrInfo.td @@ -1010,6 +1010,7 @@ def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">; def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">; def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; +def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">; def HasUINTR : Predicate<"Subtarget->hasUINTR()">; def HasCRC32 : Predicate<"Subtarget->hasCRC32()">; Index: llvm/lib/Target/X86/X86LowerAMXType.cpp =================================================================== --- llvm/lib/Target/X86/X86LowerAMXType.cpp +++ llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -129,6 +129,8 @@ } // a * b + c // The shape depends on which operand. + case Intrinsic::x86_tcmmimfp16ps_internal: + case Intrinsic::x86_tcmmrlfp16ps_internal: case Intrinsic::x86_tdpbssd_internal: case Intrinsic::x86_tdpbsud_internal: case Intrinsic::x86_tdpbusd_internal: Index: llvm/lib/Target/X86/X86RegisterInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86RegisterInfo.cpp +++ llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -1003,6 +1003,8 @@ case X86::PTILEZEROV: case X86::PTDPBF16PSV: case X86::PTDPFP16PSV: + case X86::PTCMMIMFP16PSV: + case X86::PTCMMRLFP16PSV: MachineOperand &MO1 = MI->getOperand(1); MachineOperand &MO2 = MI->getOperand(2); ShapeT Shape(&MO1, &MO2, MRI); Index: llvm/lib/TargetParser/X86TargetParser.cpp =================================================================== --- llvm/lib/TargetParser/X86TargetParser.cpp +++ llvm/lib/TargetParser/X86TargetParser.cpp @@ -606,6 +606,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_BF16 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; +constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesHRESET = {}; constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {}; Index: llvm/test/CodeGen/X86/AMX/amx-tile-complex-internals.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/AMX/amx-tile-complex-internals.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \ +; RUN: -mattr=+amx-complex \ +; RUN: -verify-machineinstrs | FileCheck %s + +define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, %ax +; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 +; CHECK-NEXT: tilezero %tmm1 +; CHECK-NEXT: tilezero %tmm2 +; CHECK-NEXT: tcmmimfp16ps %tmm1, %tmm0, %tmm2 +; CHECK-NEXT: tcmmrlfp16ps %tmm1, %tmm0, %tmm2 +; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx) +; CHECK-NEXT: tilerelease +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + + %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) + %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) + %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) + + %c1 = call x86_amx @llvm.x86.tcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) + %c2 = call x86_amx @llvm.x86.tcmmrlfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b) + + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c2) + ret void +} + +declare x86_amx @llvm.x86.tilezero.internal(i16, i16) +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) +declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64) +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) + +declare x86_amx @llvm.x86.tcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare x86_amx @llvm.x86.tcmmrlfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) Index: llvm/test/CodeGen/X86/AMX/amxcomplex-intrinsics.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/AMX/amxcomplex-intrinsics.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py + +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-complex --show-mc-encoding | FileCheck %s + +define void @test_tcmmimfp16ps() { +; CHECK-LABEL: test_tcmmimfp16ps: +; CHECK: # %bb.0: +; CHECK-NEXT: tcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x61,0x6c,0xca] +; CHECK-NEXT: retq # encoding: [0xc3] + call void @llvm.x86.tcmmimfp16ps(i8 1, i8 2, i8 3) + ret void +} +declare void @llvm.x86.tcmmimfp16ps(i8 %A, i8 %B, i8 %C) + +define void @test_tcmmrlfp16ps() { +; CHECK-LABEL: test_tcmmrlfp16ps: +; CHECK: # %bb.0: +; CHECK-NEXT: tcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6c,0xca] +; CHECK-NEXT: retq # encoding: [0xc3] + call void @llvm.x86.tcmmrlfp16ps(i8 1, i8 2, i8 3) + ret void +} +declare void @llvm.x86.tcmmrlfp16ps(i8 %A, i8 %B, i8 %C) Index: llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-att.txt =================================================================== --- /dev/null +++ llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-att.txt @@ -0,0 +1,13 @@ +# RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding --disassemble < %s | FileCheck %s + +# CHECK: tcmmimfp16ps %tmm4, %tmm5, %tmm6 +0xc4,0xe2,0x59,0x6c,0xf5 + +# CHECK: tcmmimfp16ps %tmm1, %tmm2, %tmm3 +0xc4,0xe2,0x71,0x6c,0xda + +# CHECK: tcmmrlfp16ps %tmm4, %tmm5, %tmm6 +0xc4,0xe2,0x58,0x6c,0xf5 + +# CHECK: tcmmrlfp16ps %tmm1, %tmm2, %tmm3 +0xc4,0xe2,0x70,0x6c,0xda Index: llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-intel.txt =================================================================== --- /dev/null +++ llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-complex-intel.txt @@ -0,0 +1,13 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s + +# CHECK: tcmmimfp16ps tmm6, tmm5, tmm4 +0xc4,0xe2,0x59,0x6c,0xf5 + +# CHECK: tcmmimfp16ps tmm3, tmm2, tmm1 +0xc4,0xe2,0x71,0x6c,0xda + +# CHECK: tcmmrlfp16ps tmm6, tmm5, tmm4 +0xc4,0xe2,0x58,0x6c,0xf5 + +# CHECK: tcmmrlfp16ps tmm3, tmm2, tmm1 +0xc4,0xe2,0x70,0x6c,0xda Index: llvm/test/MC/X86/AMX/x86-64-amx-complex-att.s =================================================================== --- /dev/null +++ llvm/test/MC/X86/AMX/x86-64-amx-complex-att.s @@ -0,0 +1,17 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding < %s | FileCheck %s + +// CHECK: tcmmimfp16ps %tmm4, %tmm5, %tmm6 +// CHECK: encoding: [0xc4,0xe2,0x59,0x6c,0xf5] + tcmmimfp16ps %tmm4, %tmm5, %tmm6 + +// CHECK: tcmmimfp16ps %tmm1, %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe2,0x71,0x6c,0xda] + tcmmimfp16ps %tmm1, %tmm2, %tmm3 + +// CHECK: tcmmrlfp16ps %tmm4, %tmm5, %tmm6 +// CHECK: encoding: [0xc4,0xe2,0x58,0x6c,0xf5] + tcmmrlfp16ps %tmm4, %tmm5, %tmm6 + +// CHECK: tcmmrlfp16ps %tmm1, %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe2,0x70,0x6c,0xda] + tcmmrlfp16ps %tmm1, %tmm2, %tmm3 Index: llvm/test/MC/X86/AMX/x86-64-amx-complex-intel.s =================================================================== --- /dev/null +++ llvm/test/MC/X86/AMX/x86-64-amx-complex-intel.s @@ -0,0 +1,17 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: tcmmimfp16ps tmm6, tmm5, tmm4 +// CHECK: encoding: [0xc4,0xe2,0x59,0x6c,0xf5] + tcmmimfp16ps tmm6, tmm5, tmm4 + +// CHECK: tcmmimfp16ps tmm3, tmm2, tmm1 +// CHECK: encoding: [0xc4,0xe2,0x71,0x6c,0xda] + tcmmimfp16ps tmm3, tmm2, tmm1 + +// CHECK: tcmmrlfp16ps tmm6, tmm5, tmm4 +// CHECK: encoding: [0xc4,0xe2,0x58,0x6c,0xf5] + tcmmrlfp16ps tmm6, tmm5, tmm4 + +// CHECK: tcmmrlfp16ps tmm3, tmm2, tmm1 +// CHECK: encoding: [0xc4,0xe2,0x70,0x6c,0xda] + tcmmrlfp16ps tmm3, tmm2, tmm1