Index: clang/docs/ClangCommandLineReference.rst =================================================================== --- clang/docs/ClangCommandLineReference.rst +++ clang/docs/ClangCommandLineReference.rst @@ -3127,6 +3127,12 @@ .. option:: -maes, -mno-aes +.. option:: -mamx-bf16, -mno-amx-bf16 + +.. option:: -mamx-int8, -mno-amx-int8 + +.. option:: -mamx-tile, -mno-amx-tile + .. option:: -mavx, -mno-avx .. option:: -mavx2, -mno-avx2 Index: clang/include/clang/Basic/BuiltinsX86_64.def =================================================================== --- clang/include/clang/Basic/BuiltinsX86_64.def +++ clang/include/clang/Basic/BuiltinsX86_64.def @@ -101,6 +101,22 @@ TARGET_BUILTIN(__builtin_ia32_cvtusi2sd64, "V2dV2dUOiIi", "ncV:128:", "avx512f") TARGET_BUILTIN(__builtin_ia32_cvtusi2ss64, "V4fV4fUOiIi", "ncV:128:", "avx512f") TARGET_BUILTIN(__builtin_ia32_directstore_u64, "vULi*ULi", "n", "movdiri") + +// AMX +TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile") + +TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tilestored64, "vIUcv*z", "n", "amx-tile") + +TARGET_BUILTIN(__builtin_ia32_tdpbssd, "vIUcIUcIUc", "n", "amx-int8") +TARGET_BUILTIN(__builtin_ia32_tdpbsud, "vIUcIUcIUc", "n", "amx-int8") +TARGET_BUILTIN(__builtin_ia32_tdpbusd, "vIUcIUcIUc", "n", "amx-int8") +TARGET_BUILTIN(__builtin_ia32_tdpbuud, "vIUcIUcIUc", "n", "amx-int8") +TARGET_BUILTIN(__builtin_ia32_tdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16") TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite") #undef BUILTIN Index: clang/include/clang/Basic/DiagnosticSemaKinds.td =================================================================== --- clang/include/clang/Basic/DiagnosticSemaKinds.td +++ clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -9342,6 +9342,8 @@ "invalid rounding argument">; def err_x86_builtin_invalid_scale : Error< "scale argument must be 1, 2, 4, or 8">; +def err_x86_builtin_tile_arg_duplicate : Error< + "tile arguments must refer to different tiles">; def err_builtin_target_unsupported : Error< "builtin is not supported on this target">; Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -3066,6 +3066,12 @@ def mno_3dnow : Flag<["-"], "mno-3dnow">, Group; def m3dnowa : Flag<["-"], "m3dnowa">, Group; def mno_3dnowa : Flag<["-"], "mno-3dnowa">, Group; +def mamx_bf16 : Flag<["-"], "mamx-bf16">, Group; +def mno_amx_bf16 : Flag<["-"], "mno-amx-bf16">, Group; +def mtamx_int8 : Flag<["-"], "mamx-int8">, Group; +def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group; +def mamx_tile : Flag<["-"], "mamx-tile">, Group; +def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group; def msse : Flag<["-"], "msse">, Group; def mno_sse : Flag<["-"], "mno-sse">, Group; def msse2 : Flag<["-"], "msse2">, Group; Index: clang/include/clang/Sema/Sema.h =================================================================== --- clang/include/clang/Sema/Sema.h +++ clang/include/clang/Sema/Sema.h @@ -12142,6 +12142,16 @@ bool CheckSystemZBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); bool CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall); bool CheckX86BuiltinGatherScatterScale(unsigned BuiltinID, CallExpr *TheCall); + bool CheckX86BuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall); + bool CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall, + ArrayRef ArgNums, int Low = 0, + int High = 7); + bool CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall, int ArgNum, + int Low = 0, int High = 7); + bool CheckX86BuiltinTileDuplicate(CallExpr *TheCall, ArrayRef ArgNums); + bool CheckX86BuiltinTileRangeAndDuplicate(CallExpr *TheCall, + ArrayRef ArgNums, int Low = 0, + int High = 7); bool CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, CallExpr *TheCall); bool CheckPPCBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, Index: clang/lib/Basic/Targets/X86.h =================================================================== --- clang/lib/Basic/Targets/X86.h +++ clang/lib/Basic/Targets/X86.h @@ -125,6 +125,9 @@ bool HasPTWRITE = false; bool HasINVPCID = false; bool HasENQCMD = false; + bool HasAMXTILE = false; + bool HasAMXINT8 = false; + bool HasAMXBF16 = false; bool HasSERIALIZE = false; bool HasTSXLDTRK = false; Index: clang/lib/Basic/Targets/X86.cpp =================================================================== --- clang/lib/Basic/Targets/X86.cpp +++ clang/lib/Basic/Targets/X86.cpp @@ -62,6 +62,7 @@ "cr0", "cr2", "cr3", "cr4", "cr8", "dr0", "dr1", "dr2", "dr3", "dr6", "dr7", "bnd0", "bnd1", "bnd2", "bnd3", + "tmm0", "tmm1", "tmm2", "tmm3", "tmm4", "tmm5", "tmm6", "tmm7", }; const TargetInfo::AddlRegName AddlRegNames[] = { @@ -396,7 +397,10 @@ } else if (Name == "xsaveopt" || Name == "xsavec" || Name == "xsaves") { if (Enabled) Features["xsave"] = true; - } + } else if (Name == "amx-tile" && !Enabled) { + Features["amx-bf16"] = Features["amx-int8"] = false; + } else if ((Name == "amx-bf16" || Name == "amx-int8") && Enabled) + Features["amx-tile"] = true; } /// handleTargetFeatures - Perform initialization based on the user @@ -531,6 +535,12 @@ HasINVPCID = true; } else if (Feature == "+enqcmd") { HasENQCMD = true; + } else if (Feature == "+amx-bf16") { + HasAMXBF16 = true; + } else if (Feature == "+amx-int8") { + HasAMXINT8 = true; + } else if (Feature == "+amx-tile") { + HasAMXTILE = true; } else if (Feature == "+serialize") { HasSERIALIZE = true; } else if (Feature == "+tsxldtrk") { @@ -926,6 +936,12 @@ Builder.defineMacro("__INVPCID__"); if (HasENQCMD) Builder.defineMacro("__ENQCMD__"); + if (HasAMXTILE) + Builder.defineMacro("__AMXTILE__"); + if (HasAMXINT8) + Builder.defineMacro("__AMXINT8__"); + if (HasAMXBF16) + Builder.defineMacro("__AMXBF16__"); if (HasSERIALIZE) Builder.defineMacro("__SERIALIZE__"); if (HasTSXLDTRK) @@ -1022,6 +1038,9 @@ .Case("3dnowa", true) .Case("adx", true) .Case("aes", true) + .Case("amx-bf16", true) + .Case("amx-int8", true) + .Case("amx-tile", true) .Case("avx", true) .Case("avx2", true) .Case("avx512f", true) @@ -1104,6 +1123,9 @@ return llvm::StringSwitch(Feature) .Case("adx", HasADX) .Case("aes", HasAES) + .Case("amx-bf16", HasAMXBF16) + .Case("amx-int8", HasAMXINT8) + .Case("amx-tile", HasAMXTILE) .Case("avx", SSELevel >= AVX) .Case("avx2", SSELevel >= AVX2) .Case("avx512f", SSELevel >= AVX512F) Index: clang/lib/Headers/CMakeLists.txt =================================================================== --- clang/lib/Headers/CMakeLists.txt +++ clang/lib/Headers/CMakeLists.txt @@ -2,6 +2,7 @@ adxintrin.h altivec.h ammintrin.h + amxintrin.h arm_acle.h arm_cmse.h armintr.h Index: clang/lib/Headers/amxintrin.h =================================================================== --- /dev/null +++ clang/lib/Headers/amxintrin.h @@ -0,0 +1,230 @@ +/*===--------------- amxintrin.h - AMX intrinsics -*- C++ -*-----------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMXINTRIN_H +#define __AMXINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-tile"))) + +/// Load tile configuration from a 64-byte memory location specified by +/// "mem_addr". The tile configuration includes the tile type palette, the +/// number of bytes per row, and the number of rows. If the specified +/// palette_id is zero, that signifies the init state for both the tile +/// config and the tile data, and the tiles are zeroed. Any invalid +/// configurations will result in #GP fault. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the LDTILECFG instruction. +/// +/// \param __config +/// A pointer point to m512 +static __inline__ void __DEFAULT_FN_ATTRS +_tile_loadconfig(const void *__config) +{ + __builtin_ia32_tile_loadconfig(__config); +} + +/// Stores the current tile configuration to a 64-byte memory location +/// specified by "mem_addr". The tile configuration includes the tile type +/// palette, the number of bytes per row, and the number of rows. If tiles +/// are not configured, all zeroes will be stored to memory. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the STTILECFG instruction. +/// +/// \param __config +/// A pointer point to m512 +static __inline__ void __DEFAULT_FN_ATTRS +_tile_storeconfig(void *__config) +{ + __builtin_ia32_tile_storeconfig(__config); +} + +/// Release the tile configuration to return to the init state, which +/// releases all storage it currently holds. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILERELEASE instruction. +static __inline__ void __DEFAULT_FN_ATTRS +_tile_release(void) +{ + __builtin_ia32_tilerelease(); +} + +/// Load tile rows from memory specifieid by "base" address and "stride" into +/// destination tile "dst" using the tile configuration previously configured +/// via "_tile_loadconfig". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILELOADD instruction. +/// +/// \param dst +/// A destination tile. Max size is 1024 Bytes. +/// \param base +/// A pointer point to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +#define _tile_loadd(dst, base, stride) \ + __builtin_ia32_tileloadd64((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride)) + +/// Load tile rows from memory specifieid by "base" address and "stride" into +/// destination tile "dst" using the tile configuration previously configured +/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation +/// that the data will likely not be reused in the near future and the data +/// caching can be optimized accordingly. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILELOADDT1 instruction. +/// +/// \param dst +/// A destination tile. Max size is 1024 Bytes. +/// \param base +/// A pointer point to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +#define _tile_stream_loadd(dst, base, stride) \ + __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), (__SIZE_TYPE__)(stride)) + +/// Store the tile specified by "src" to memory specifieid by "base" address and +/// "stride" using the tile configuration previously configured via +/// "_tile_loadconfig". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILESTORED instruction. +/// +/// \param dst +/// A destination tile. Max size is 1024 Bytes. +/// \param base +/// A pointer point to base address. +/// \param stride +/// The stride between the rows' data to be stored in memory. +#define _tile_stored(dst, base, stride) \ + __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride)) + +/// Zero the tile specified by "tdest". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILEZERO instruction. +/// +/// \param tile +/// The destination tile to be zero. Max size is 1024 Bytes. +#define _tile_zero(tile) \ + __builtin_ia32_tilezero((tile)) + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with +/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit +/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", +/// and store the 32-bit result back to tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBSSD instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_dpbssd(dst, src0, src1) \ + __builtin_ia32_tdpbssd((dst), (src0), (src1)) + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with +/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate +/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in "dst", and store the 32-bit result back to tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBSUD instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_dpbsud(dst, src0, src1) \ + __builtin_ia32_tdpbsud((dst), (src0), (src1)) + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with +/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit +/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", +/// and store the 32-bit result back to tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBUSD instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_dpbusd(dst, src0, src1) \ + __builtin_ia32_tdpbusd((dst), (src0), (src1)) + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with +/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate +/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in +/// "dst", and store the 32-bit result back to tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBUUD instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_dpbuud(dst, src0, src1) \ + __builtin_ia32_tdpbuud((dst), (src0), (src1)) + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and +/// src1, accumulating the intermediate single-precision (32-bit) floating-point +/// elements with elements in "dst", and store the 32-bit result back to tile +/// "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBF16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_dpbf16ps(dst, src0, src1) \ + __builtin_ia32_tdpbf16ps((dst), (src0), (src1)) + +#undef __DEFAULT_FN_ATTRS + +#endif /* __x86_64__ */ +#endif /* __AMXINTRIN_H */ Index: clang/lib/Headers/cpuid.h =================================================================== --- clang/lib/Headers/cpuid.h +++ clang/lib/Headers/cpuid.h @@ -190,6 +190,9 @@ #define bit_TSXLDTRK 0x00010000 #define bit_PCONFIG 0x00040000 #define bit_IBT 0x00100000 +#define bit_AMXBF16 0x00400000 +#define bit_AMXTILE 0x01000000 +#define bit_AMXINT8 0x02000000 /* Features in %eax for leaf 7 sub-leaf 1 */ #define bit_AVX512BF16 0x00000020 Index: clang/lib/Headers/immintrin.h =================================================================== --- clang/lib/Headers/immintrin.h +++ clang/lib/Headers/immintrin.h @@ -471,6 +471,11 @@ #include #endif +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__) +#include +#endif + #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ defined(__AVX512VP2INTERSECT__) #include Index: clang/lib/Sema/SemaChecking.cpp =================================================================== --- clang/lib/Sema/SemaChecking.cpp +++ clang/lib/Sema/SemaChecking.cpp @@ -88,6 +88,7 @@ #include "llvm/Support/SaveAndRestore.h" #include "llvm/Support/raw_ostream.h" #include +#include #include #include #include @@ -3607,6 +3608,65 @@ << Arg->getSourceRange(); } +bool Sema::CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall, + ArrayRef ArgNums, + int Low, int High) { + for (int ArgNum : ArgNums) { + if (SemaBuiltinConstantArgRange(TheCall, ArgNum, Low, High)) + return true; + } + return false; +} + +bool Sema::CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall, int ArgNum, + int Low, int High) { + return SemaBuiltinConstantArgRange(TheCall, ArgNum, Low, High); +} + +bool Sema::CheckX86BuiltinTileDuplicate(CallExpr *TheCall, + ArrayRef ArgNums) { + // Because the max number of tile register is 8, so here we use each bit + // to represent the usage of them in bitset<8>. + std::bitset<8> ArgValues; + for (int ArgNum : ArgNums) { + llvm::APSInt Arg; + SemaBuiltinConstantArg(TheCall, ArgNum, Arg); + int ArgExtValue = Arg.getExtValue(); + assert((ArgExtValue >= 0 || ArgExtValue < 8) && + "Incorrect tile register num."); + if (ArgValues.test(ArgExtValue)) + return Diag(TheCall->getBeginLoc(), + diag::err_x86_builtin_tile_arg_duplicate) + << TheCall->getArg(ArgNum)->getSourceRange(); + ArgValues.set(ArgExtValue); + } + return false; +} + +bool Sema::CheckX86BuiltinTileRangeAndDuplicate(CallExpr *TheCall, + ArrayRef ArgNums, int Low, + int High) { + return CheckX86BuiltinTileArgumentsRange(TheCall, ArgNums, Low, High) || + CheckX86BuiltinTileDuplicate(TheCall, ArgNums); +} + +bool Sema::CheckX86BuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { + switch (BuiltinID) { + default: + return false; + case X86::BI__builtin_ia32_tileloadd64: + case X86::BI__builtin_ia32_tileloaddt164: + case X86::BI__builtin_ia32_tilestored64: + case X86::BI__builtin_ia32_tilezero: + return CheckX86BuiltinTileArgumentsRange(TheCall, 0); + case X86::BI__builtin_ia32_tdpbssd: + case X86::BI__builtin_ia32_tdpbsud: + case X86::BI__builtin_ia32_tdpbusd: + case X86::BI__builtin_ia32_tdpbuud: + case X86::BI__builtin_ia32_tdpbf16ps: + return CheckX86BuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2}); + } +} static bool isX86_32Builtin(unsigned BuiltinID) { // These builtins only work on x86-32 targets. switch (BuiltinID) { @@ -3640,6 +3700,10 @@ if (CheckX86BuiltinGatherScatterScale(BuiltinID, TheCall)) return true; + // If the intrinsic has a tile arguments, make sure they are valid. + if (CheckX86BuiltinTileArguments(BuiltinID, TheCall)) + return true; + // For intrinsics which take an immediate value as part of the instruction, // range check them here. int i = 0, l = 0, u = 0; Index: clang/test/CodeGen/AMX/amx.c =================================================================== --- /dev/null +++ clang/test/CodeGen/AMX/amx.c @@ -0,0 +1,32 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-int8 \ +// RUN: -target-feature +amx-bf16 -emit-llvm -o - -Wall -Werror -pedantic | FileCheck %s --check-prefixes=CHECK + +#include + +void test_amx(void *data) { + //CHECK-LABEL: @test_amx + //CHECK: call void @llvm.x86.ldtilecfg(i8* %{{.*}}) + //CHECK: call void @llvm.x86.sttilecfg(i8* %{{.*}}) + //CHECK: call void @llvm.x86.tilerelease() + //CHECK: call void @llvm.x86.tilezero(i8 3) + //CHECK: call void @llvm.x86.tileloadd64(i8 4, i8* %{{.*}}, i64 8) + //CHECK: call void @llvm.x86.tileloaddt164(i8 0, i8* %{{.*}}, i64 1) + //CHECK: call void @llvm.x86.tilestored64(i8 0, i8* %{{.*}}, i64 1) + //CHECK: call void @llvm.x86.tdpbssd(i8 1, i8 2, i8 3) + //CHECK: call void @llvm.x86.tdpbsud(i8 1, i8 2, i8 3) + //CHECK: call void @llvm.x86.tdpbusd(i8 1, i8 2, i8 3) + //CHECK: call void @llvm.x86.tdpbuud(i8 1, i8 2, i8 3) + //CHECK: call void @llvm.x86.tdpbf16ps(i8 1, i8 2, i8 3) + _tile_loadconfig(data); + _tile_storeconfig(data); + _tile_release(); + _tile_zero(3); + _tile_loadd(4, data, 8); + _tile_stream_loadd(0, data, 1); + _tile_stored(0, data, 1); + _tile_dpbssd(1, 2, 3); + _tile_dpbsud(1, 2, 3); + _tile_dpbusd(1, 2, 3); + _tile_dpbuud(1, 2, 3); + _tile_dpbf16ps(1, 2, 3); +} Index: clang/test/CodeGen/AMX/amx_errors.c =================================================================== --- /dev/null +++ clang/test/CodeGen/AMX/amx_errors.c @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-int8 -target-feature +amx-bf16 -emit-llvm -fsyntax-only -verify + +#include + +void test_amx(void *data) { + _tile_zero(16); // expected-error {{argument value 16 is outside the valid range [0, 7]}} + _tile_loadd(19, data, 16); // expected-error {{argument value 19 is outside the valid range [0, 7]}} + _tile_stream_loadd(23, data, 1); // expected-error {{argument value 23 is outside the valid range [0, 7]}} + _tile_stored(88, data, 1); // expected-error {{argument value 88 is outside the valid range [0, 7]}} + _tile_dpbssd(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}} + _tile_dpbssd(0, 16, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}} + _tile_dpbuud(0, 2, 16); // expected-error {{argument value 16 is outside the valid range [0, 7]}} + _tile_dpbsud(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}} + _tile_dpbsud(7, 1, 7); // expected-error {{tile arguments must refer to different tiles}} + _tile_dpbsud(4, 3, 3); // expected-error {{tile arguments must refer to different tiles}} + _tile_dpbf16ps(4, 3, 3); // expected-error {{tile arguments must refer to different tiles}} +} Index: clang/test/CodeGen/AMX/amx_inline_asm.c =================================================================== --- /dev/null +++ clang/test/CodeGen/AMX/amx_inline_asm.c @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-int8 -target-feature +amx-bf16 -emit-llvm -o - -Wall -Werror -pedantic | FileCheck %s --check-prefixes=CHECK,X86_64 + +void f_tilemul(short a) +{ + //CHECK: call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0 \0A\09tileloadd 0(%rdx,%r14,4), %tmm6 \0A\09tdpbf16ps %tmm6, %tmm0, %tmm7 \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"() + __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0 \n\t" + "tileloadd 0(%%rdx,%%r14,4), %%tmm6 \n\t" + "tdpbf16ps %%tmm6, %%tmm0, %%tmm7 \n\t" + "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t" + ::: "memory", "tmm0", "tmm6", "tmm7"); +} Index: clang/test/Driver/x86-target-features.c =================================================================== --- clang/test/Driver/x86-target-features.c +++ clang/test/Driver/x86-target-features.c @@ -232,3 +232,18 @@ // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-tsxldtrk %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-TSXLDTRK %s // TSXLDTRK: "-target-feature" "+tsxldtrk" // NO-TSXLDTRK: "-target-feature" "-tsxldtrk" + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-tile %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-TILE %s +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-tile %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-TILE %s +// AMX-TILE: "-target-feature" "+amx-tile" +// NO-AMX-TILE: "-target-feature" "-amx-tile" + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-bf16 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-BF16 %s +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-bf16 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-BF16 %s +// AMX-BF16: "-target-feature" "+amx-bf16" +// NO-AMX-BF16: "-target-feature" "-amx-bf16" + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-int8 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-INT8 %s +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-int8 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-INT8 %s +// AMX-INT8: "-target-feature" "+amx-int8" +// NO-AMX-INT8: "-target-feature" "-amx-int8" Index: clang/test/Preprocessor/x86_amx_target_features.c =================================================================== --- /dev/null +++ clang/test/Preprocessor/x86_amx_target_features.c @@ -0,0 +1,35 @@ +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-tile -x c -E -dM -o - %s | FileCheck -check-prefix=AMX-TILE %s + +// AMX-TILE: #define __AMXTILE__ 1 + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-bf16 -x c -E -dM -o - %s | FileCheck -check-prefix=AMX-BF16 %s + +// AMX-BF16: #define __AMXBF16__ 1 +// AMX-BF16: #define __AMXTILE__ 1 + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-int8 -x c -E -dM -o - %s | FileCheck -check-prefix=AMX-INT8 %s + +// AMX-INT8: #define __AMXINT8__ 1 +// AMX-INT8: #define __AMXTILE__ 1 + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-tile -x c -E -dM -o - %s | FileCheck -check-prefix=NOAMX-TILE %s + +// NOAMX-TILE-NOT: #define __AMXTILE__ 1 + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-bf16 -x c -E -dM -o - %s | FileCheck -check-prefix=NOAMX-BF16 %s + +// NOAMX-BF16-NOT: #define __AMXBF16__ 1 + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -amx-bf16 -mno-amx-tile -x c -E -dM -o - %s | FileCheck -check-prefix=NOAMX-BF16 %s + +// NOAMX-BF16-NOT: #define __AMXTILE__ 1 +// NOAMX-BF16-NOT: #define __AMXBF16__ 1 + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-int8 -x c -E -dM -o - %s | FileCheck -check-prefix=NOAMX-INT8 %s + +// NOAMX-INT8-NOT: #define __AMXINT8__ 1 + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -amx-int8 -mno-amx-tile -x c -E -dM -o - %s | FileCheck -check-prefix=NOAMX-INT8 %s + +// NOAMX-INT8-NOT: #define __AMXTILE__ 1 +// NOAMX-INT8-NOT: #define __AMXINT8__ 1 Index: llvm/include/llvm/IR/IntrinsicsX86.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsX86.td +++ llvm/include/llvm/IR/IntrinsicsX86.td @@ -4948,3 +4948,32 @@ def int_x86_xresldtrk : GCCBuiltin<"__builtin_ia32_xresldtrk">, Intrinsic<[], [], []>; } +//===----------------------------------------------------------------------===// +// AMX - Intel AMX extensions + +let TargetPrefix = "x86" in { + def int_x86_ldtilecfg : GCCBuiltin<"__builtin_ia32_tile_loadconfig">, + Intrinsic<[], [llvm_ptr_ty], []>; + def int_x86_sttilecfg : GCCBuiltin<"__builtin_ia32_tile_storeconfig">, + Intrinsic<[], [llvm_ptr_ty], []>; + def int_x86_tilerelease : GCCBuiltin<"__builtin_ia32_tilerelease">, + Intrinsic<[], [], []>; + def int_x86_tilezero : GCCBuiltin<"__builtin_ia32_tilezero">, + Intrinsic<[], [llvm_i8_ty], []>; + def int_x86_tileloadd64 : GCCBuiltin<"__builtin_ia32_tileloadd64">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], []>; + def int_x86_tileloaddt164 : GCCBuiltin<"__builtin_ia32_tileloaddt164">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], []>; + def int_x86_tilestored64 : GCCBuiltin<"__builtin_ia32_tilestored64">, + Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty], []>; + def int_x86_tdpbssd : GCCBuiltin<"__builtin_ia32_tdpbssd">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>; + def int_x86_tdpbsud : GCCBuiltin<"__builtin_ia32_tdpbsud">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>; + def int_x86_tdpbusd : GCCBuiltin<"__builtin_ia32_tdpbusd">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>; + def int_x86_tdpbuud : GCCBuiltin<"__builtin_ia32_tdpbuud">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>; + def int_x86_tdpbf16ps : GCCBuiltin<"__builtin_ia32_tdpbf16ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], []>; +} Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4355,8 +4355,39 @@ break; } + case Intrinsic::x86_tileloadd64: + case Intrinsic::x86_tileloaddt164: + case Intrinsic::x86_tilestored64: { + if (!Subtarget->hasAMXTILE()) + break; + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; + case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; + case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; + } + // FIXME: Match displacement and scale. + unsigned TIndex = Node->getConstantOperandVal(2); + SDValue TReg = getI8Imm(TIndex, dl); + SDValue Base = Node->getOperand(3); + SDValue Scale = getI8Imm(1, dl); + SDValue Index = Node->getOperand(4); + SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue Chain = Node->getOperand(0); + MachineSDNode *CNode; + if (Opc == X86::PTILESTORED) { + SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; + CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + } else { + SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; + CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + } + ReplaceNode(Node, CNode); + return; + } } - break; } case ISD::BRIND: { Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -33008,6 +33008,10 @@ const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); + auto TMMImmToTMMReg = [](unsigned Imm) { + assert (Imm < 8 && "Illegal tmm index."); + return X86::TMM0 + Imm; + }; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case X86::TLS_addr32: @@ -33290,6 +33294,69 @@ MI.eraseFromParent(); return BB; } + case X86::PTDPBSSD: + case X86::PTDPBSUD: + case X86::PTDPBUSD: + case X86::PTDPBUUD: + case X86::PTDPBF16PS: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch (MI.getOpcode()) { + case X86::PTDPBSSD: Opc = X86::TDPBSSD; break; + case X86::PTDPBSUD: Opc = X86::TDPBSUD; break; + case X86::PTDPBUSD: Opc = X86::TDPBUSD; break; + case X86::PTDPBUUD: Opc = X86::TDPBUUD; break; + case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break; + } + + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef); + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } + case X86::PTILEZERO: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Imm = MI.getOperand(0).getImm(); + + BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm)); + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } + case X86::PTILELOADD: + case X86::PTILELOADDT1: + case X86::PTILESTORED: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch (MI.getOpcode()) { + case X86::PTILELOADD: Opc = X86::TILELOADD; break; + case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break; + case X86::PTILESTORED: Opc = X86::TILESTORED; break; + } + + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + unsigned CurOp = 0; + if (Opc != X86::TILESTORED) + MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), + RegState::Define); + + MIB.add(MI.getOperand(CurOp++)); // base + MIB.add(MI.getOperand(CurOp++)); // scale + MIB.add(MI.getOperand(CurOp++)); // index -- stride + MIB.add(MI.getOperand(CurOp++)); // displacement + MIB.add(MI.getOperand(CurOp++)); // segment + + if (Opc == X86::TILESTORED) + MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), + RegState::Undef); + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } } } Index: llvm/lib/Target/X86/X86InstrAMX.td =================================================================== --- llvm/lib/Target/X86/X86InstrAMX.td +++ llvm/lib/Target/X86/X86InstrAMX.td @@ -18,9 +18,11 @@ let SchedRW = [WriteSystem] in { let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), - "ldtilecfg\t$src", []>, VEX, T8PS; + "ldtilecfg\t$src", + [(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS; def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), - "sttilecfg\t$src", []>, VEX, T8PD; + "sttilecfg\t$src", + [(int_x86_sttilecfg addr:$src)]>, VEX, T8PD; def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src), "tileloadd\t{$src, $dst|$dst, $src}", []>, @@ -31,7 +33,7 @@ VEX, T8PD; let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in def TILERELEASE : I<0x49, MRM_C0, (outs), (ins), - "tilerelease", []>, VEX, T8PS; + "tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS; def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs), (ins sibmem:$dst, TILE:$src), "tilestored\t{$src, $dst|$dst, $src}", []>, @@ -39,6 +41,17 @@ def TILEZERO : I<0x49, MRMr0, (outs TILE:$dst), (ins), "tilezero\t$dst", []>, VEX, T8XD; + + let usesCustomInserter = 1 in { + // Pseudo instructions, using immediates instead of tile registers. + // To be translated to the actual instructions in X86ISelLowering.cpp + def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1, + sibmem:$src2), []>; + def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>; + def PTILEZERO : PseudoI<(outs), (ins u8imm:$src), + [(int_x86_tilezero imm:$src)]>; + } } // SchedRW } // HasAMXTILE @@ -62,6 +75,27 @@ "tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8PS; } + + let usesCustomInserter = 1 in { + // Pseudo instructions, using immediates instead of tile registers. + // To be translated to the actual instructions in X86ISelLowering.cpp + def PTDPBSSD : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbssd imm:$src1, + imm:$src2, imm:$src3)]>; + def PTDPBSUD : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbsud imm:$src1, + imm:$src2, imm:$src3)]>; + def PTDPBUSD : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbusd imm:$src1, + imm:$src2, imm:$src3)]>; + def PTDPBUUD : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbuud imm:$src1, + imm:$src2, imm:$src3)]>; + } } } // HasAMXTILE @@ -72,5 +106,14 @@ (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8XS; + + let usesCustomInserter = 1 in { + // Pseudo instructions, using immediates instead of tile registers. + // To be translated to the actual instructions in X86ISelLowering.cpp + def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbf16ps imm:$src1, + imm:$src2, imm:$src3)]>; + } } } // HasAMXTILE, HasAMXBF16 Index: llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll @@ -0,0 +1,13 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -mattr=+amx-bf16 -verify-machineinstrs | FileCheck %s + +define void @test_amx() { +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK-NEXT: tdpbf16ps %tmm7, %tmm4, %tmm3 +; CHECK-NEXT: retq + call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7) + ret void +} + +declare void @llvm.x86.tdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2) Index: llvm/test/CodeGen/X86/AMX/amx-int8-intrinsics.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/AMX/amx-int8-intrinsics.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -verify-machineinstrs | FileCheck %s + +define void @test_amx() { +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK-NEXT: tdpbssd %tmm7, %tmm4, %tmm3 +; CHECK-NEXT: tdpbsud %tmm7, %tmm4, %tmm3 +; CHECK-NEXT: tdpbusd %tmm7, %tmm0, %tmm3 +; CHECK-NEXT: tdpbuud %tmm1, %tmm4, %tmm3 +; CHECK-NEXT: retq + call void @llvm.x86.tdpbssd(i8 3, i8 4, i8 7) + call void @llvm.x86.tdpbsud(i8 3, i8 4, i8 7) + call void @llvm.x86.tdpbusd(i8 3, i8 0, i8 7) + call void @llvm.x86.tdpbuud(i8 3, i8 4, i8 1) + ret void +} + +declare void @llvm.x86.tdpbssd(i8 %tile0, i8 %tile1, i8 %tile2) +declare void @llvm.x86.tdpbsud(i8 %tile0, i8 %tile1, i8 %tile2) +declare void @llvm.x86.tdpbusd(i8 %tile0, i8 %tile1, i8 %tile2) +declare void @llvm.x86.tdpbuud(i8 %tile0, i8 %tile1, i8 %tile2) Index: llvm/test/CodeGen/X86/AMX/amx-tile-intrinsics.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/AMX/amx-tile-intrinsics.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -verify-machineinstrs | FileCheck %s + +define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK-NEXT: ldtilecfg (%rdi) +; CHECK-NEXT: sttilecfg (%rdi) +; CHECK-NEXT: tilerelease +; CHECK-NEXT: tilezero %tmm3 +; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm3 +; CHECK-NEXT: tileloaddt1 (%rsi,%rdx), %tmm3 +; CHECK-NEXT: tilestored %tmm3, (%rsi,%rdx) +; CHECK-NEXT: retq + call void @llvm.x86.ldtilecfg(i8* %pointer) + call void @llvm.x86.sttilecfg(i8* %pointer) + call void @llvm.x86.tilerelease() + call void @llvm.x86.tilezero(i8 3) + call void @llvm.x86.tileloadd64(i8 3, i8* %base, i64 %stride) + call void @llvm.x86.tileloaddt164(i8 3, i8* %base, i64 %stride) + call void @llvm.x86.tilestored64(i8 3, i8* %base, i64 %stride) + ret void +} + +declare void @llvm.x86.tileloadd64(i8 %tile, i8* %base, i64 %stride) +declare void @llvm.x86.tileloaddt164(i8 %tile, i8* %base, i64 %stride) +declare void @llvm.x86.tilestored64(i8 %tile, i8* %base, i64 %stride) +declare void @llvm.x86.ldtilecfg(i8* %pointer) +declare void @llvm.x86.sttilecfg(i8* %pointer) +declare void @llvm.x86.tilerelease() +declare void @llvm.x86.tilezero(i8 %tile)