Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -5332,6 +5332,13 @@
     Function *F = CGM.getIntrinsic(IntrinsicID);
     llvm::FunctionType *FTy = F->getFunctionType();
 
+    auto isX86AMXType = [](llvm::Type* Ty) -> bool {
+      if (TargetExtType *TargetExtTy = dyn_cast<TargetExtType>(Ty))
+        if (TargetExtTy->getName() == "x86.AMX")
+          return true;
+      return false;
+    };
+
     for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
       Value *ArgValue;
       // If this is a normal argument, just emit it as a scalar.
@@ -5358,16 +5365,17 @@
                 ArgValue->getType()->getPointerTo(PtrTy->getAddressSpace()));
           }
         }
-
-        assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) &&
-               "Must be able to losslessly bit cast to param");
         // Cast vector type (e.g., v256i32) to x86_amx, this only happen
         // in amx intrinsics.
-        if (PTy->isX86_AMXTy())
-          ArgValue = Builder.CreateIntrinsic(Intrinsic::x86_cast_vector_to_tile,
-                                             {ArgValue->getType()}, {ArgValue});
-        else
+        if (isX86AMXType(PTy)) {
+          ArgValue =
+              Builder.CreateIntrinsic(Intrinsic::x86_bitconvert_vector_to_tile,
+                                      {}, {ArgValue});
+        } else {
+          assert(PTy->canLosslesslyBitCastTo(FTy->getParamType(i)) &&
+                 "Must be able to losslessly bit cast to param");
           ArgValue = Builder.CreateBitCast(ArgValue, PTy);
+        }
       }
 
       Args.push_back(ArgValue);
@@ -5389,15 +5397,16 @@
       }
     }
 
-    assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
-           "Must be able to losslessly bit cast result type");
     // Cast x86_amx to vector type (e.g., v256i32), this only happen
     // in amx intrinsics.
-    if (V->getType()->isX86_AMXTy())
-      V = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, {RetTy},
+    if (isX86AMXType(V->getType())) {
+      V = Builder.CreateIntrinsic(Intrinsic::x86_bitconvert_tile_to_vector, {},
                                   {V});
-    else
+    } else {
+      assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
+             "Must be able to losslessly bit cast result type");
       V = Builder.CreateBitCast(V, RetTy);
+    }
   }
 
   if (RetTy->isVoidTy())
Index: clang/test/CodeGen/X86/amx_api.c
===================================================================
--- clang/test/CodeGen/X86/amx_api.c
+++ clang/test/CodeGen/X86/amx_api.c
@@ -11,10 +11,10 @@
 // This is an example code and integration test.
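For orientation before the updated tests, a hand-written sketch of the IR shape the new lowering produces; it assumes only the two intrinsic declarations added to IntrinsicsX86.td further down and is not itself part of the patch:

  declare target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32>)
  declare <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX"))

  ; Hypothetical round trip between the vector view and the tile view.
  define <256 x i32> @roundtrip(<256 x i32> %v) {
    %t = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %v)
    %r = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %t)
    ret <256 x i32> %r
  }

Unlike the old llvm.x86.cast.* intrinsics, the bitconvert intrinsics are not overloaded (the vector side is fixed to <256 x i32>), so no .v256i32 suffix appears in the mangled names checked below.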
void test_api(int cond, short row, short col) { //CHECK-LABEL: @test_api - //CHECK-DAG: call x86_amx @llvm.x86.tileloadd64.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.tdpbssd.internal + //CHECK-DAG: call target("x86.AMX") @llvm.x86.tileloadd64.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.tdpbssd.internal //CHECK-DAG: call void @llvm.x86.tilestored64.internal __tile1024i a = {row, 8}; __tile1024i b = {8, col}; @@ -35,78 +35,78 @@ void test_tile_loadd(short row, short col) { //CHECK-LABEL: @test_tile_loadd - //CHECK-DAG: call x86_amx @llvm.x86.tileloadd64.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.tileloadd64.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") {{%.*}}) __tile1024i a = {row, col}; __tile_loadd(&a, buf, STRIDE); } void test_tile_stream_loadd(short row, short col) { //CHECK-LABEL: @test_tile_stream_loadd - //CHECK-DAG: call x86_amx @llvm.x86.tileloaddt164.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.tileloaddt164.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") {{%.*}}) __tile1024i a = {row, col}; __tile_stream_loadd(&a, buf, STRIDE); } void test_tile_dpbssd(__tile1024i a, __tile1024i b, __tile1024i c) { //CHECK-LABEL: @test_tile_dpbssd - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.tdpbssd.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.tdpbssd.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") {{%.*}}) __tile_dpbssd(&c, a, b); } void test_tile_dpbsud(__tile1024i a, __tile1024i b, __tile1024i c) { //CHECK-LABEL: @test_tile_dpbsud - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.tdpbsud.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.tdpbsud.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") {{%.*}}) __tile_dpbsud(&c, a, b); } void test_tile_dpbusd(__tile1024i a, __tile1024i b, __tile1024i c) { //CHECK-LABEL: @test_tile_dpbusd - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.tdpbusd.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.tdpbusd.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") {{%.*}}) 
__tile_dpbusd(&c, a, b); } void test_tile_dpbuud(__tile1024i a, __tile1024i b, __tile1024i c) { //CHECK-LABEL: @test_tile_dpbuud - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.tdpbuud.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.tdpbuud.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") {{%.*}}) __tile_dpbuud(&c, a, b); } void test_tile_stored(__tile1024i c) { //CHECK-LABEL: @test_tile_stored - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> {{%.*}}) //CHECK-DAG: call void @llvm.x86.tilestored64.internal __tile_stored(buf, STRIDE, c); } void test_tile_zero(__tile1024i c) { //CHECK-LABEL: @test_tile_zero - //CHECK-DAG: call x86_amx @llvm.x86.tilezero.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.tilezero.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") {{%.*}}) __tile_zero(&c); } void test_tile_dpbf16ps(__tile1024i a, __tile1024i b, __tile1024i c) { //CHECK-LABEL: @test_tile_dpbf16ps - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.tdpbf16ps.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.tdpbf16ps.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") {{%.*}}) __tile_dpbf16ps(&a, b, c); } void test_tile_dpfp16ps(__tile1024i a, __tile1024i b, __tile1024i c) { //CHECK-LABEL: @test_tile_dpfp16ps - //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) - //CHECK-DAG: call x86_amx @llvm.x86.tdpfp16ps.internal - //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> {{%.*}}) + //CHECK-DAG: call target("x86.AMX") @llvm.x86.tdpfp16ps.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") {{%.*}}) __tile_dpfp16ps(&a, b, c); } Index: llvm/bindings/ocaml/llvm/llvm.ml =================================================================== --- llvm/bindings/ocaml/llvm/llvm.ml +++ llvm/bindings/ocaml/llvm/llvm.ml @@ -45,7 +45,6 @@ | Token | ScalableVector | BFloat - | X86_amx end module Linkage = struct Index: llvm/bindings/ocaml/llvm/llvm.mli =================================================================== --- llvm/bindings/ocaml/llvm/llvm.mli +++ llvm/bindings/ocaml/llvm/llvm.mli @@ -82,7 +82,6 @@ | Token | ScalableVector | BFloat - | X86_amx end (** The linkage of a global value, accessed with {!linkage} and @@ -557,7 +556,7 @@ val get_module_flag : llmodule -> string -> llmetadata option (** [add_module_flag m b k v] Add a module-level flag b, with key [k] and - value [v] to the flags metadata of module [m]. It will create the + value [v] to the flags metadata of module [m]. 
It will create the module-level flags named metadata if it doesn't already exist. *) val add_module_flag : llmodule -> ModuleFlagBehavior.t -> string -> llmetadata -> unit Index: llvm/docs/BitCodeFormat.rst =================================================================== --- llvm/docs/BitCodeFormat.rst +++ llvm/docs/BitCodeFormat.rst @@ -1331,13 +1331,6 @@ * *paramty*: Zero or more type indices representing the parameter types of the function -TYPE_CODE_X86_AMX Record -^^^^^^^^^^^^^^^^^^^^^^^^ - -``[X86_AMX]`` - -The ``X86_AMX`` record (code 24) adds an ``x86_amx`` type to the type table. - TYPE_CODE_TARGET_TYPE Record ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -3560,24 +3560,6 @@ IEEE-754-2008 specifications for binary16, binary32, binary64, and binary128 respectively. -X86_amx Type -"""""""""""" - -:Overview: - -The x86_amx type represents a value held in an AMX tile register on an x86 -machine. The operations allowed on it are quite limited. Only few intrinsics -are allowed: stride load and store, zero and dot product. No instruction is -allowed for this type. There are no arguments, arrays, pointers, vectors -or constants of this type. - -:Syntax: - -:: - - x86_amx - - X86_mmx Type """""""""""" @@ -4021,7 +4003,7 @@ format is represented by ``0xR`` followed by 4 hexadecimal digits. All hexadecimal formats are big-endian (sign bit at the left). -There are no constants of type x86_mmx and x86_amx. +There are no constants of type x86_mmx. .. _complexconstants: Index: llvm/include/llvm-c/Core.h =================================================================== --- llvm/include/llvm-c/Core.h +++ llvm/include/llvm-c/Core.h @@ -165,7 +165,6 @@ LLVMTokenTypeKind, /**< Tokens */ LLVMScalableVectorTypeKind, /**< Scalable SIMD vector type */ LLVMBFloatTypeKind, /**< 16 bit brain floating point type */ - LLVMX86_AMXTypeKind, /**< X86 AMX */ LLVMTargetExtTypeKind, /**< Target extension type */ } LLVMTypeKind; @@ -1549,11 +1548,6 @@ */ LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C); -/** - * Create a X86 AMX type in a context. - */ -LLVMTypeRef LLVMX86AMXTypeInContext(LLVMContextRef C); - /** * Create a token type in a context. */ @@ -1571,7 +1565,6 @@ LLVMTypeRef LLVMVoidType(void); LLVMTypeRef LLVMLabelType(void); LLVMTypeRef LLVMX86MMXType(void); -LLVMTypeRef LLVMX86AMXType(void); /** * Create a target extension type in LLVM context. Index: llvm/include/llvm/IR/DataLayout.h =================================================================== --- llvm/include/llvm/IR/DataLayout.h +++ llvm/include/llvm/IR/DataLayout.h @@ -701,8 +701,6 @@ case Type::PPC_FP128TyID: case Type::FP128TyID: return TypeSize::Fixed(128); - case Type::X86_AMXTyID: - return TypeSize::Fixed(8192); // In memory objects this is always aligned to a higher boundary, but // only 80 bits contain information. 
case Type::X86_FP80TyID: Index: llvm/include/llvm/IR/IntrinsicsX86.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsX86.td +++ llvm/include/llvm/IR/IntrinsicsX86.td @@ -5414,6 +5414,10 @@ DefaultAttrsIntrinsic<[llvm_x86amx_ty], [llvm_anyvector_ty], [IntrNoMem]>; def int_x86_cast_tile_to_vector: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_x86amx_ty], [IntrNoMem]>; + def int_x86_bitconvert_vector_to_tile: + DefaultAttrsIntrinsic<[llvm_x86amx_ty], [llvm_v256i32_ty], [IntrNoMem]>; + def int_x86_bitconvert_tile_to_vector: + DefaultAttrsIntrinsic<[llvm_v256i32_ty], [llvm_x86amx_ty], [IntrNoMem]>; } //===----------------------------------------------------------------------===// Index: llvm/include/llvm/IR/Type.h =================================================================== --- llvm/include/llvm/IR/Type.h +++ llvm/include/llvm/IR/Type.h @@ -64,7 +64,6 @@ LabelTyID, ///< Labels MetadataTyID, ///< Metadata X86_MMXTyID, ///< MMX vectors (64 bits, X86 specific) - X86_AMXTyID, ///< AMX vectors (8192 bits, X86 specific) TokenTyID, ///< Tokens // Derived types... see DerivedTypes.h file. @@ -200,9 +199,6 @@ /// Return true if this is X86 MMX. bool isX86_MMXTy() const { return getTypeID() == X86_MMXTyID; } - /// Return true if this is X86 AMX. - bool isX86_AMXTy() const { return getTypeID() == X86_AMXTyID; } - /// Return true if this is a target extension type. bool isTargetExtTy() const { return getTypeID() == TargetExtTyID; } @@ -279,7 +275,7 @@ /// includes all first-class types except struct and array types. bool isSingleValueType() const { return isFloatingPointTy() || isX86_MMXTy() || isIntegerTy() || - isPointerTy() || isVectorTy() || isX86_AMXTy() || isTargetExtTy(); + isPointerTy() || isVectorTy() || isTargetExtTy(); } /// Return true if the type is an aggregate type. This means it is valid as @@ -295,8 +291,7 @@ bool isSized(SmallPtrSetImpl *Visited = nullptr) const { // If it's a primitive, it is always sized. if (getTypeID() == IntegerTyID || isFloatingPointTy() || - getTypeID() == PointerTyID || getTypeID() == X86_MMXTyID || - getTypeID() == X86_AMXTyID) + getTypeID() == PointerTyID || getTypeID() == X86_MMXTyID) return true; // If it is not something that can have a size (e.g. a function or label), // it doesn't have a size. 
@@ -458,7 +453,6 @@ static Type *getFP128Ty(LLVMContext &C); static Type *getPPC_FP128Ty(LLVMContext &C); static Type *getX86_MMXTy(LLVMContext &C); - static Type *getX86_AMXTy(LLVMContext &C); static Type *getTokenTy(LLVMContext &C); static IntegerType *getIntNTy(LLVMContext &C, unsigned N); static IntegerType *getInt1Ty(LLVMContext &C); @@ -495,7 +489,6 @@ static PointerType *getFP128PtrTy(LLVMContext &C, unsigned AS = 0); static PointerType *getPPC_FP128PtrTy(LLVMContext &C, unsigned AS = 0); static PointerType *getX86_MMXPtrTy(LLVMContext &C, unsigned AS = 0); - static PointerType *getX86_AMXPtrTy(LLVMContext &C, unsigned AS = 0); static PointerType *getIntNPtrTy(LLVMContext &C, unsigned N, unsigned AS = 0); static PointerType *getInt1PtrTy(LLVMContext &C, unsigned AS = 0); static PointerType *getInt8PtrTy(LLVMContext &C, unsigned AS = 0); Index: llvm/lib/Analysis/ConstantFolding.cpp =================================================================== --- llvm/lib/Analysis/ConstantFolding.cpp +++ llvm/lib/Analysis/ConstantFolding.cpp @@ -564,16 +564,14 @@ Type *MapTy = Type::getIntNTy(C->getContext(), DL.getTypeSizeInBits(LoadTy).getFixedValue()); if (Constant *Res = FoldReinterpretLoadFromConst(C, MapTy, Offset, DL)) { - if (Res->isNullValue() && !LoadTy->isX86_MMXTy() && - !LoadTy->isX86_AMXTy()) + if (Res->isNullValue() && !LoadTy->isX86_MMXTy()) // Materializing a zero can be done trivially without a bitcast return Constant::getNullValue(LoadTy); Type *CastTy = LoadTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(LoadTy) : LoadTy; Res = FoldBitCast(Res, CastTy, DL); if (LoadTy->isPtrOrPtrVectorTy()) { // For vector of pointer, we needed to first convert to a vector of integer, then do vector inttoptr - if (Res->isNullValue() && !LoadTy->isX86_MMXTy() && - !LoadTy->isX86_AMXTy()) + if (Res->isNullValue() && !LoadTy->isX86_MMXTy()) return Constant::getNullValue(LoadTy); if (DL.isNonIntegralPointerType(LoadTy->getScalarType())) // Be careful not to replace a load of an addrspace value with an inttoptr here @@ -762,7 +760,7 @@ return PoisonValue::get(Ty); if (isa(C)) return UndefValue::get(Ty); - if (C->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy()) + if (C->isNullValue() && !Ty->isX86_MMXTy()) return Constant::getNullValue(Ty); if (C->isAllOnesValue() && (Ty->isIntOrIntVectorTy() || Ty->isFPOrFPVectorTy())) Index: llvm/lib/AsmParser/LLLexer.cpp =================================================================== --- llvm/lib/AsmParser/LLLexer.cpp +++ llvm/lib/AsmParser/LLLexer.cpp @@ -803,7 +803,6 @@ TYPEKEYWORD("label", Type::getLabelTy(Context)); TYPEKEYWORD("metadata", Type::getMetadataTy(Context)); TYPEKEYWORD("x86_mmx", Type::getX86_MMXTy(Context)); - TYPEKEYWORD("x86_amx", Type::getX86_AMXTy(Context)); TYPEKEYWORD("token", Type::getTokenTy(Context)); if (Keyword == "ptr") { Index: llvm/lib/Bitcode/Reader/BitcodeReader.cpp =================================================================== --- llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2331,9 +2331,6 @@ case bitc::TYPE_CODE_X86_MMX: // X86_MMX ResultTy = Type::getX86_MMXTy(Context); break; - case bitc::TYPE_CODE_X86_AMX: // X86_AMX - ResultTy = Type::getX86_AMXTy(Context); - break; case bitc::TYPE_CODE_TOKEN: // TOKEN ResultTy = Type::getTokenTy(Context); break; Index: llvm/lib/Bitcode/Writer/BitcodeWriter.cpp =================================================================== --- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -968,7 
+968,6 @@
   case Type::LabelTyID:    Code = bitc::TYPE_CODE_LABEL;    break;
   case Type::MetadataTyID: Code = bitc::TYPE_CODE_METADATA; break;
   case Type::X86_MMXTyID:  Code = bitc::TYPE_CODE_X86_MMX;  break;
-  case Type::X86_AMXTyID:  Code = bitc::TYPE_CODE_X86_AMX;  break;
   case Type::TokenTyID:    Code = bitc::TYPE_CODE_TOKEN;    break;
   case Type::IntegerTyID:
     // INTEGER: [width]
Index: llvm/lib/CodeGen/ValueTypes.cpp
===================================================================
--- llvm/lib/CodeGen/ValueTypes.cpp
+++ llvm/lib/CodeGen/ValueTypes.cpp
@@ -202,7 +202,8 @@
   case MVT::f128:    return Type::getFP128Ty(Context);
   case MVT::ppcf128: return Type::getPPC_FP128Ty(Context);
   case MVT::x86mmx:  return Type::getX86_MMXTy(Context);
-  case MVT::x86amx:  return Type::getX86_AMXTy(Context);
+  case MVT::x86amx:
+    return TargetExtType::get(Context, "x86.AMX");
   case MVT::i64x8:   return IntegerType::get(Context, 512);
   case MVT::externref:
     // pointer to opaque struct in addrspace(10)
@@ -575,7 +576,15 @@
   case Type::DoubleTyID:    return MVT(MVT::f64);
   case Type::X86_FP80TyID:  return MVT(MVT::f80);
   case Type::X86_MMXTyID:   return MVT(MVT::x86mmx);
-  case Type::X86_AMXTyID:   return MVT(MVT::x86amx);
+  case Type::TargetExtTyID: {
+    TargetExtType *TargetExtTy = cast<TargetExtType>(Ty);
+    if (TargetExtTy->getName() == "x86.AMX")
+      return MVT(MVT::x86amx);
+    else {
+      if (HandleUnknown) return MVT(MVT::Other);
+      llvm_unreachable("Unknown target extension type!");
+    }
+  }
   case Type::FP128TyID:     return MVT(MVT::f128);
   case Type::PPC_FP128TyID: return MVT(MVT::ppcf128);
   case Type::PointerTyID:   return MVT(MVT::iPTR);
Index: llvm/lib/IR/AsmWriter.cpp
===================================================================
--- llvm/lib/IR/AsmWriter.cpp
+++ llvm/lib/IR/AsmWriter.cpp
@@ -546,7 +546,6 @@
   case Type::LabelTyID:     OS << "label"; return;
   case Type::MetadataTyID:  OS << "metadata"; return;
   case Type::X86_MMXTyID:   OS << "x86_mmx"; return;
-  case Type::X86_AMXTyID:   OS << "x86_amx"; return;
   case Type::TokenTyID:     OS << "token"; return;
   case Type::IntegerTyID:
     OS << 'i' << cast<IntegerType>(Ty)->getBitWidth();
Index: llvm/lib/IR/ConstantFold.cpp
===================================================================
--- llvm/lib/IR/ConstantFold.cpp
+++ llvm/lib/IR/ConstantFold.cpp
@@ -362,7 +362,7 @@
     return UndefValue::get(DestTy);
   }
 
-  if (V->isNullValue() && !DestTy->isX86_MMXTy() && !DestTy->isX86_AMXTy() &&
+  if (V->isNullValue() && !DestTy->isX86_MMXTy() &&
       opc != Instruction::AddrSpaceCast)
     return Constant::getNullValue(DestTy);
Index: llvm/lib/IR/Core.cpp
===================================================================
--- llvm/lib/IR/Core.cpp
+++ llvm/lib/IR/Core.cpp
@@ -534,8 +534,6 @@
     return LLVMVectorTypeKind;
   case Type::X86_MMXTyID:
     return LLVMX86_MMXTypeKind;
-  case Type::X86_AMXTyID:
-    return LLVMX86_AMXTypeKind;
   case Type::TokenTyID:
     return LLVMTokenTypeKind;
   case Type::ScalableVectorTyID:
@@ -651,9 +649,6 @@
 LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C) {
   return (LLVMTypeRef) Type::getX86_MMXTy(*unwrap(C));
 }
-LLVMTypeRef LLVMX86AMXTypeInContext(LLVMContextRef C) {
-  return (LLVMTypeRef) Type::getX86_AMXTy(*unwrap(C));
-}
 
 LLVMTypeRef LLVMHalfType(void) {
   return LLVMHalfTypeInContext(LLVMGetGlobalContext());
@@ -679,9 +674,6 @@
 LLVMTypeRef LLVMX86MMXType(void) {
   return LLVMX86MMXTypeInContext(LLVMGetGlobalContext());
 }
-LLVMTypeRef LLVMX86AMXType(void) {
-  return LLVMX86AMXTypeInContext(LLVMGetGlobalContext());
-}
 
 /*--.. Operations on function types ......................................--*/
Index: llvm/lib/IR/DataLayout.cpp
===================================================================
--- llvm/lib/IR/DataLayout.cpp
+++ llvm/lib/IR/DataLayout.cpp
@@ -814,8 +814,6 @@
     // count should be enough here.
     return Align(PowerOf2Ceil(getTypeStoreSize(Ty).getKnownMinValue()));
   }
-  case Type::X86_AMXTyID:
-    return Align(64);
   case Type::TargetExtTyID: {
     Type *LayoutTy = cast<TargetExtType>(Ty)->getLayoutType();
     return getAlignment(LayoutTy, abi_or_pref);
Index: llvm/lib/IR/Function.cpp
===================================================================
--- llvm/lib/IR/Function.cpp
+++ llvm/lib/IR/Function.cpp
@@ -958,7 +958,6 @@
   case Type::FP128TyID:     Result += "f128"; break;
   case Type::PPC_FP128TyID: Result += "ppcf128"; break;
   case Type::X86_MMXTyID:   Result += "x86mmx"; break;
-  case Type::X86_AMXTyID:   Result += "x86amx"; break;
   case Type::IntegerTyID:
     Result += "i" + utostr(cast<IntegerType>(Ty)->getBitWidth());
     break;
@@ -1373,7 +1372,7 @@
   case IITDescriptor::Void: return Type::getVoidTy(Context);
   case IITDescriptor::VarArg: return Type::getVoidTy(Context);
   case IITDescriptor::MMX: return Type::getX86_MMXTy(Context);
-  case IITDescriptor::AMX: return Type::getX86_AMXTy(Context);
+  case IITDescriptor::AMX: return TargetExtType::get(Context, "x86.AMX");
   case IITDescriptor::Token: return Type::getTokenTy(Context);
   case IITDescriptor::Metadata: return Type::getMetadataTy(Context);
   case IITDescriptor::Half: return Type::getHalfTy(Context);
@@ -1546,7 +1545,10 @@
     case IITDescriptor::Void: return !Ty->isVoidTy();
     case IITDescriptor::VarArg: return true;
     case IITDescriptor::MMX: return !Ty->isX86_MMXTy();
-    case IITDescriptor::AMX: return !Ty->isX86_AMXTy();
+    case IITDescriptor::AMX:
+      if (TargetExtType *TargetExtTy = dyn_cast<TargetExtType>(Ty))
+        return TargetExtTy->getName() != "x86.AMX";
+      return true;
     case IITDescriptor::Token: return !Ty->isTokenTy();
     case IITDescriptor::Metadata: return !Ty->isMetadataTy();
     case IITDescriptor::Half: return !Ty->isHalfTy();
Index: llvm/lib/IR/LLVMContextImpl.h
===================================================================
--- llvm/lib/IR/LLVMContextImpl.h
+++ llvm/lib/IR/LLVMContextImpl.h
@@ -1513,7 +1513,7 @@
   // Basic type instances.
Type VoidTy, LabelTy, HalfTy, BFloatTy, FloatTy, DoubleTy, MetadataTy, TokenTy; - Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy, X86_AMXTy; + Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy; IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty, Int128Ty; std::unique_ptr TheNoneToken; Index: llvm/lib/IR/LLVMContextImpl.cpp =================================================================== --- llvm/lib/IR/LLVMContextImpl.cpp +++ llvm/lib/IR/LLVMContextImpl.cpp @@ -45,7 +45,7 @@ MetadataTy(C, Type::MetadataTyID), TokenTy(C, Type::TokenTyID), X86_FP80Ty(C, Type::X86_FP80TyID), FP128Ty(C, Type::FP128TyID), PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_MMXTy(C, Type::X86_MMXTyID), - X86_AMXTy(C, Type::X86_AMXTyID), Int1Ty(C, 1), Int8Ty(C, 8), + Int1Ty(C, 1), Int8Ty(C, 8), Int16Ty(C, 16), Int32Ty(C, 32), Int64Ty(C, 64), Int128Ty(C, 128) { if (OpaquePointersCL.getNumOccurrences()) { OpaquePointers = OpaquePointersCL; Index: llvm/lib/IR/Type.cpp =================================================================== --- llvm/lib/IR/Type.cpp +++ llvm/lib/IR/Type.cpp @@ -46,7 +46,6 @@ case LabelTyID : return getLabelTy(C); case MetadataTyID : return getMetadataTy(C); case X86_MMXTyID : return getX86_MMXTy(C); - case X86_AMXTyID : return getX86_AMXTy(C); case TokenTyID : return getTokenTy(C); default: return nullptr; @@ -123,14 +122,6 @@ Ty->getPrimitiveSizeInBits().getFixedValue() == 64) return true; - // 8192-bit fixed width vector types can be losslessly converted to x86amx. - if (((isa(this)) && Ty->isX86_AMXTy()) && - getPrimitiveSizeInBits().getFixedValue() == 8192) - return true; - if ((isX86_AMXTy() && isa(Ty)) && - Ty->getPrimitiveSizeInBits().getFixedValue() == 8192) - return true; - // At this point we have only various mismatches of the first class types // remaining and ptr->ptr. Just select the lossless conversions. Everything // else is not lossless. 
Conservatively assume we can't losslessly convert @@ -170,7 +161,6 @@ case Type::FP128TyID: return TypeSize::Fixed(128); case Type::PPC_FP128TyID: return TypeSize::Fixed(128); case Type::X86_MMXTyID: return TypeSize::Fixed(64); - case Type::X86_AMXTyID: return TypeSize::Fixed(8192); case Type::IntegerTyID: return TypeSize::Fixed(cast(this)->getBitWidth()); case Type::FixedVectorTyID: @@ -233,7 +223,6 @@ Type *Type::getFP128Ty(LLVMContext &C) { return &C.pImpl->FP128Ty; } Type *Type::getPPC_FP128Ty(LLVMContext &C) { return &C.pImpl->PPC_FP128Ty; } Type *Type::getX86_MMXTy(LLVMContext &C) { return &C.pImpl->X86_MMXTy; } -Type *Type::getX86_AMXTy(LLVMContext &C) { return &C.pImpl->X86_AMXTy; } IntegerType *Type::getInt1Ty(LLVMContext &C) { return &C.pImpl->Int1Ty; } IntegerType *Type::getInt8Ty(LLVMContext &C) { return &C.pImpl->Int8Ty; } @@ -278,10 +267,6 @@ return getX86_MMXTy(C)->getPointerTo(AS); } -PointerType *Type::getX86_AMXPtrTy(LLVMContext &C, unsigned AS) { - return getX86_AMXTy(C)->getPointerTo(AS); -} - PointerType *Type::getIntNPtrTy(LLVMContext &C, unsigned N, unsigned AS) { return getIntNTy(C, N)->getPointerTo(AS); } @@ -652,8 +637,7 @@ bool ArrayType::isValidElementType(Type *ElemTy) { return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() && !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() && - !ElemTy->isTokenTy() && !ElemTy->isX86_AMXTy() && - !isa(ElemTy); + !ElemTy->isTokenTy() && !isa(ElemTy); } //===----------------------------------------------------------------------===// @@ -779,8 +763,7 @@ bool PointerType::isValidElementType(Type *ElemTy) { return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() && - !ElemTy->isMetadataTy() && !ElemTy->isTokenTy() && - !ElemTy->isX86_AMXTy(); + !ElemTy->isMetadataTy() && !ElemTy->isTokenTy(); } bool PointerType::isLoadableOrStorableType(Type *ElemTy) { @@ -854,6 +837,11 @@ return TargetTypeInfo(Type::getInt8PtrTy(C, 0), TargetExtType::HasZeroInit, TargetExtType::CanBeGlobal); } + if (Name.startswith("x86.AMX")) { + return TargetTypeInfo( + ArrayType::get(FixedVectorType::get(Type::getIntNTy(C, 32), 16), 16), + TargetExtType::HasZeroInit, TargetExtType::CanBeGlobal); + } return TargetTypeInfo(Type::getVoidTy(C)); } Index: llvm/lib/IR/TypedPointerType.cpp =================================================================== --- llvm/lib/IR/TypedPointerType.cpp +++ llvm/lib/IR/TypedPointerType.cpp @@ -38,6 +38,5 @@ bool TypedPointerType::isValidElementType(Type *ElemTy) { return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() && - !ElemTy->isMetadataTy() && !ElemTy->isTokenTy() && - !ElemTy->isX86_AMXTy(); + !ElemTy->isMetadataTy() && !ElemTy->isTokenTy(); } Index: llvm/lib/IR/Verifier.cpp =================================================================== --- llvm/lib/IR/Verifier.cpp +++ llvm/lib/IR/Verifier.cpp @@ -2596,8 +2596,6 @@ "Function takes metadata but isn't an intrinsic", &Arg, &F); Check(!Arg.getType()->isTokenTy(), "Function takes token but isn't an intrinsic", &Arg, &F); - Check(!Arg.getType()->isX86_AMXTy(), - "Function takes x86_amx but isn't an intrinsic", &Arg, &F); } // Check that swifterror argument is only used by loads and stores. @@ -2610,8 +2608,6 @@ if (!IsIntrinsic) { Check(!F.getReturnType()->isTokenTy(), "Function returns a token but isn't an intrinsic", &F); - Check(!F.getReturnType()->isX86_AMXTy(), - "Function returns a x86_amx but isn't an intrinsic", &F); } // Get the function metadata attachments. 
@@ -3393,8 +3389,6 @@
   if (!Call.getCalledFunction()) {
     Check(!FTy->getReturnType()->isTokenTy(),
           "Return type cannot be token for indirect call!");
-    Check(!FTy->getReturnType()->isX86_AMXTy(),
-          "Return type cannot be x86_amx for indirect call!");
   }
 
   if (Function *F = Call.getCalledFunction())
@@ -4966,9 +4960,6 @@
   for (Value *V : Call.args()) {
     if (auto *MD = dyn_cast<MetadataAsValue>(V))
       visitMetadataAsValue(*MD, Call.getCaller());
-    if (auto *Const = dyn_cast<Constant>(V))
-      Check(!Const->getType()->isX86_AMXTy(),
-            "const x86_amx is not allowed in argument!");
   }
 
   switch (ID) {
Index: llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
===================================================================
--- llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -1018,7 +1018,6 @@
   switch (T->getTypeID()) {
   case Type::BFloatTyID:
-  case Type::X86_AMXTyID:
   case Type::TokenTyID:
   case Type::TargetExtTyID:
     llvm_unreachable("These should never be used!!!");
Index: llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
===================================================================
--- llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -330,7 +330,6 @@
   case Type::LabelTyID:
   case Type::MetadataTyID:
   case Type::X86_MMXTyID:
-  case Type::X86_AMXTyID:
   case Type::TokenTyID:
   case Type::TypedPointerTyID:
   case Type::TargetExtTyID:
Index: llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
===================================================================
--- llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
+++ llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp
@@ -54,6 +54,12 @@
 }
 #endif
+static bool isAMXBitConvert(Instruction *II, Value*& V) {
+  return match(II,
+               m_Intrinsic<Intrinsic::x86_bitconvert_vector_to_tile>(m_Value(V))) ||
+         match(II, m_Intrinsic<Intrinsic::x86_bitconvert_tile_to_vector>(m_Value(V)));
+}
+
 static cl::opt<bool>
     X86ScalarizeAMX("enable-x86-scalar-amx", cl::init(false), cl::Hidden,
                     cl::desc("X86: enable AMX scalarizition."));
@@ -218,7 +224,7 @@
     return ResVec;
   } else {
-    auto *BitCast = cast<BitCastInst>(Tile);
+    auto *BitCast = cast<IntrinsicInst>(Tile);
     Value *Vec = BitCast->getOperand(0);
     assert(isV256I32Ty(Vec->getType()) && "bitcast from non-v256i32 to x86amx");
     // tilestore.scalarize.cols.body:
@@ -301,16 +307,16 @@
   Value *CurrentInner = &*InnerLoopHeader->begin();
 
   FixedVectorType *V256I32Ty = FixedVectorType::get(B.getInt32Ty(), 256);
-  auto *BitCastAcc = cast<BitCastInst>(Acc);
+  auto *BitCastAcc = cast<IntrinsicInst>(Acc);
   Value *VecC = BitCastAcc->getOperand(0);
   assert(isV256I32Ty(VecC->getType()) && "bitcast from non-v256i32 to x86amx");
   // TODO else create BitCast from x86amx to v256i32.
   // Store x86amx to memory, and reload from memory
   // to vector. However with -O0, it doesn't happen.
-  auto *BitCastLHS = cast<BitCastInst>(LHS);
+  auto *BitCastLHS = cast<IntrinsicInst>(LHS);
   Value *VecA = BitCastLHS->getOperand(0);
   assert(isV256I32Ty(VecA->getType()) && "bitcast from non-v256i32 to x86amx");
-  auto *BitCastRHS = cast<BitCastInst>(RHS);
+  auto *BitCastRHS = cast<IntrinsicInst>(RHS);
   Value *VecB = BitCastRHS->getOperand(0);
   assert(isV256I32Ty(VecB->getType()) && "bitcast from non-v256i32 to x86amx");
@@ -497,12 +503,12 @@
   // insert one bitcast as required
   Builder.SetInsertPoint(End->getFirstNonPHI());
   Value *ResAMX =
-      Builder.CreateBitCast(ResVec, Type::getX86_AMXTy(Builder.getContext()));
+      Builder.CreateIntrinsic(Intrinsic::x86_bitconvert_vector_to_tile, {}, {ResVec});
   // Delete TileDP intrinsic and do some clean-up.
   for (Use &U : llvm::make_early_inc_range(TileDP->uses())) {
     Instruction *I = cast<Instruction>(U.getUser());
     Value *Vec;
-    if (match(I, m_BitCast(m_Value(Vec)))) {
+    if (isAMXBitConvert(I, Vec)) {
       I->replaceAllUsesWith(ResVec);
       I->eraseFromParent();
     }
@@ -541,12 +547,13 @@
   // insert one bitcast as required
   Builder.SetInsertPoint(End->getFirstNonPHI());
   Value *ResAMX =
-      Builder.CreateBitCast(ResVec, Type::getX86_AMXTy(Builder.getContext()));
+      Builder.CreateIntrinsic(Intrinsic::x86_bitconvert_vector_to_tile, {}, {ResVec});
+
   // Delete tileloadd6 intrinsic and do some clean-up
   for (Use &U : llvm::make_early_inc_range(TileLoadStore->uses())) {
     Instruction *I = cast<Instruction>(U.getUser());
     Value *Vec;
-    if (match(I, m_BitCast(m_Value(Vec)))) {
+    if (isAMXBitConvert(I, Vec)) {
       I->replaceAllUsesWith(ResVec);
       I->eraseFromParent();
     }
@@ -564,7 +571,7 @@
   for (Use &U : llvm::make_early_inc_range(TileZero->uses())) {
     Instruction *I = cast<Instruction>(U.getUser());
     Value *Vec;
-    if (match(I, m_BitCast(m_Value(Vec)))) {
+    if (isAMXBitConvert(I, Vec)) {
       I->replaceAllUsesWith(VecZero);
       I->eraseFromParent();
     }
Index: llvm/lib/Target/X86/X86LowerAMXType.cpp
===================================================================
--- llvm/lib/Target/X86/X86LowerAMXType.cpp
+++ llvm/lib/Target/X86/X86LowerAMXType.cpp
@@ -68,12 +68,26 @@
 
 #define DEBUG_TYPE "lower-amx-type"
 
+static bool isX86AMXType(Type* Ty) {
+  if (TargetExtType *TargetExtTy = dyn_cast<TargetExtType>(Ty))
+    if (TargetExtTy->getName() == "x86.AMX")
+      return true;
+
+  return false;
+}
+
 static bool isAMXCast(Instruction *II) {
   return match(II,
                m_Intrinsic<Intrinsic::x86_cast_vector_to_tile>(m_Value())) ||
          match(II, m_Intrinsic<Intrinsic::x86_cast_tile_to_vector>(m_Value()));
 }
 
+static bool isAMXBitConvert(Instruction *II) {
+  return match(II,
+               m_Intrinsic<Intrinsic::x86_bitconvert_vector_to_tile>()) ||
+         match(II, m_Intrinsic<Intrinsic::x86_bitconvert_tile_to_vector>());
+}
+
 static bool isAMXIntrinsic(Value *I) {
   auto *II = dyn_cast<IntrinsicInst>(I);
   if (!II)
     return false;
@@ -82,10 +96,10 @@
     return false;
   // Check if return type or parameter is x86_amx. If it is x86_amx
   // the intrinsic must be x86 amx intrinsics.
-  if (II->getType()->isX86_AMXTy())
+  if (isX86AMXType(II->getType()))
     return true;
   for (Value *V : II->args()) {
-    if (V->getType()->isX86_AMXTy())
+    if (isX86AMXType(V->getType()))
       return true;
   }
 
@@ -99,7 +113,7 @@
   const DataLayout &DL = M->getDataLayout();
   LLVMContext &Ctx = Builder.getContext();
-  auto AllocaAlignment = DL.getPrefTypeAlign(Type::getX86_AMXTy(Ctx));
+  auto AllocaAlignment = DL.getPrefTypeAlign(TargetExtType::get(Ctx, "x86.AMX"));
   unsigned AllocaAS = DL.getAllocaAddrSpace();
   AllocaInst *AllocaRes =
       new AllocaInst(Ty, AllocaAS, "", &F.getEntryBlock().front());
@@ -223,9 +237,9 @@
 public:
   X86LowerAMXType(Function &F) : Func(F) {}
   bool visit();
-  void combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast);
-  void combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST);
-  bool transformBitcast(BitCastInst *Bitcast);
+  void combineLoadBitcast(LoadInst *LD, IntrinsicInst *Bitcast);
+  void combineBitcastStore(IntrinsicInst *Bitcast, StoreInst *ST);
+  bool transformBitcast(IntrinsicInst *Bitcast);
 };
 
 // %src = load <256 x i32>, <256 x i32>* %addr, align 64
@@ -233,7 +247,7 @@
 // -->
 // %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col,
 //                                                  i8* %addr, i64 %stride64)
-void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) {
+void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, IntrinsicInst *Bitcast) {
   Value *Row = nullptr, *Col = nullptr;
   Use &U = *(Bitcast->use_begin());
   unsigned OpNo = U.getOperandNo();
@@ -258,7 +272,7 @@
 // -->
 // call void @llvm.x86.tilestored64.internal(%row, %col, %addr,
 //                                           %stride64, %13)
-void X86LowerAMXType::combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) {
+void X86LowerAMXType::combineBitcastStore(IntrinsicInst *Bitcast, StoreInst *ST) {
 
   Value *Tile = Bitcast->getOperand(0);
   auto *II = cast<IntrinsicInst>(Tile);
@@ -291,7 +305,7 @@
 }
 
 // transform bitcast to instructions.
-bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) {
+bool X86LowerAMXType::transformBitcast(IntrinsicInst *Bitcast) {
   IRBuilder<> Builder(Bitcast);
   AllocaInst *AllocaAddr;
   Value *I8Ptr, *Stride;
@@ -303,7 +317,7 @@
     Stride = Builder.getInt64(64);
   };
 
-  if (Bitcast->getType()->isX86_AMXTy()) {
+  if (isX86AMXType(Bitcast->getType())) {
     // %2 = bitcast <256 x i32> %src to x86_amx
     // -->
     // %addr = alloca <256 x i32>, align 64
@@ -356,12 +370,13 @@
   for (BasicBlock *BB : post_order(&Func)) {
     for (Instruction &Inst :
          llvm::make_early_inc_range(llvm::reverse(*BB))) {
-      auto *Bitcast = dyn_cast<BitCastInst>(&Inst);
+      auto *Bitcast =
+          isAMXBitConvert(&Inst) ? cast<IntrinsicInst>(&Inst) : nullptr;
+
       if (!Bitcast)
         continue;
 
       Value *Src = Bitcast->getOperand(0);
-      if (Bitcast->getType()->isX86_AMXTy()) {
+      if (isX86AMXType(Bitcast->getType())) {
         if (Bitcast->user_empty()) {
           DeadInsts.push_back(Bitcast);
           continue;
         }
@@ -392,7 +407,7 @@
         DeadInsts.push_back(Bitcast);
         if (LD->hasOneUse())
           DeadInsts.push_back(LD);
-      } else if (Src->getType()->isX86_AMXTy()) {
+      } else if (isX86AMXType(Src->getType())) {
         if (Bitcast->user_empty()) {
           DeadInsts.push_back(Bitcast);
           continue;
         }
@@ -462,7 +477,7 @@
 }
 
 static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) {
-  assert(TileDef->getType()->isX86_AMXTy() && "Not define tile!");
+  assert(isX86AMXType(TileDef->getType()) && "Not define tile!");
   auto *II = cast<IntrinsicInst>(TileDef);
   assert(II && "Not tile intrinsic!");
   Value *Row = II->getOperand(0);
@@ -481,7 +496,7 @@
 static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) {
   Value *V = U.get();
-  assert(V->getType()->isX86_AMXTy() && "Not define tile!");
+  assert(isX86AMXType(V->getType()) && "Not define tile!");
 
   // Get tile shape.
   IntrinsicInst *II = nullptr;
@@ -673,7 +688,7 @@
     SmallVector<Instruction *, 8> AMXDefInsts;
 
     for (Instruction &I : BB) {
-      if (!I.getType()->isX86_AMXTy())
+      if (!isX86AMXType(I.getType()))
        continue;
      if (isa<PHINode>(&I))
        PHIInsts.push_back(&I);
@@ -1131,7 +1146,7 @@
     Stride = Builder.getInt64(64);
   };
 
-  if (AMXCast->getType()->isX86_AMXTy()) {
+  if (isX86AMXType(AMXCast->getType())) {
    // %2 = amxcast <225 x i32> %src to x86_amx
    // call void @llvm.x86.tilestored64.internal(i16 15, i16 60,
    //                                           i8* %addr3, i64 60, x86_amx %2)
Index: llvm/lib/Target/X86/X86PreAMXConfig.cpp
===================================================================
--- llvm/lib/Target/X86/X86PreAMXConfig.cpp
+++ llvm/lib/Target/X86/X86PreAMXConfig.cpp
@@ -57,11 +57,21 @@
 
 #define DEBUG_TYPE "pre-amx-config"
 
+static bool isX86AMXType(Type* Ty) {
+  if (TargetExtType *TargetExtTy = dyn_cast<TargetExtType>(Ty))
+    if (TargetExtTy->getName() == "x86.AMX")
+      return true;
+
+  return false;
+}
+
 static bool isAMXIntrinsic(IntrinsicInst *II) {
-  for (Value *Operand : II->operands())
-    if (Operand->getType()->isX86_AMXTy())
+  for (Value *Operand : II->operands()) {
+    if (isX86AMXType(Operand->getType()))
       return true;
-  return II->getType()->isX86_AMXTy();
+  }
+
+  return isX86AMXType(II->getType());
 }
 
 static bool isTileLoad(IntrinsicInst *II) {
@@ -75,10 +85,12 @@
 #ifndef NDEBUG
 static bool onlyTileDef(IntrinsicInst *II) {
-  for (Value *Operand : II->operands())
-    if (Operand->getType()->isX86_AMXTy())
+  for (Value *Operand : II->operands()) {
+    if (isX86AMXType(Operand->getType()))
       return false;
-  return II->getType()->isX86_AMXTy();
+  }
+
+  return isX86AMXType(II->getType());
 }
 
 static bool brokenVolatile(Instruction *I) {
@@ -216,11 +228,11 @@
   // All Loads should be operands of KeyAMX.
   // All tile operands of KeyAMX should come from Loads.
   for (Value *Op : KeyAMX->operands()) {
-    if (Op->getType()->isX86_AMXTy())
+    if (isX86AMXType(Op->getType())) {
       if (!Loads.erase(Op))
         return false;
+    }
   }
-
   // The def of KeyAMX should be stored into mem.
   // Todo: is it key amx can be no def?
return Loads.empty() && (ST == cast(KeyAMX)); @@ -230,7 +242,7 @@ SmallVector &Shapes) { for (unsigned I = 0; I < KeyAMX->getNumOperands(); I++) { Value *Op = KeyAMX->getOperand(I); - if (!Op->getType()->isX86_AMXTy()) + if (!isX86AMXType(Op->getType())) continue; IntrinsicInst *TileDef = dyn_cast(Op); assert((TileDef && isTileLoad(TileDef)) && Index: llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp =================================================================== --- llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2533,12 +2533,7 @@ Value *Addr = LI->getOperand(0); if (Addr == &CI || isa(Addr)) return nullptr; - // Don't tranform "load <256 x i32>, <256 x i32>*" to - // "load x86_amx, x86_amx*", because x86_amx* is invalid. - // TODO: Remove this check when bitcast between vector and x86_amx - // is replaced with a specific intrinsic. - if (DestTy->isX86_AMXTy()) - return nullptr; + if (LI->hasOneUse() && LI->isSimple()) continue; // If a LoadInst has more than one use, changing the type of loaded Index: llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp =================================================================== --- llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -618,14 +618,7 @@ // Note that we should not do this for pointer<->integer casts, // because that would result in type punning. if (Load.hasOneUse()) { - // Don't transform when the type is x86_amx, it makes the pass that lower - // x86_amx type happy. Type *LoadTy = Load.getType(); - if (auto *BC = dyn_cast(Load.user_back())) { - assert(!LoadTy->isX86_AMXTy() && "Load from x86_amx* should not happen!"); - if (BC->getType()->isX86_AMXTy()) - return nullptr; - } if (auto *CastUser = dyn_cast(Load.user_back())) { Type *DestTy = CastUser->getDestTy(); @@ -1145,13 +1138,7 @@ // Fold away bit casts of the stored value by storing the original type. if (auto *BC = dyn_cast(V)) { - assert(!BC->getType()->isX86_AMXTy() && - "store to x86_amx* should not happen!"); V = BC->getOperand(0); - // Don't transform when the type is x86_amx, it makes the pass that lower - // x86_amx type happy. 
- if (V->getType()->isX86_AMXTy()) - return false; if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) { combineStoreToNewValue(IC, SI, V); return true; Index: llvm/test/CodeGen/X86/AMX/amx-across-func.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-across-func.ll +++ llvm/test/CodeGen/X86/AMX/amx-across-func.ll @@ -199,12 +199,12 @@ ; O0-NEXT: popq %rbp ; O0-NEXT: tilerelease ; O0-NEXT: retq - %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32) - %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) + %3 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32) + %4 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) call void @foo() - %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32) - %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4) - tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %6) + %5 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32) + %6 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, target("x86.AMX") %5, target("x86.AMX") %3, target("x86.AMX") %4) + tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, target("x86.AMX") %6) ret void } @@ -449,17 +449,17 @@ br i1 %5, label %13, label %11 6: %7 = phi i32 [ %9, %6 ], [ 0, %2 ] - %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr @buf, i64 32) + %8 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr @buf, i64 32) call void @foo() - tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %8) + tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, target("x86.AMX") %8) call void @foo() %9 = add i32 %7, 1 %10 = icmp eq i32 %9, 0 br i1 %10, label %4, label %6 11: call void @foo() - %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) - tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32, x86_amx %12) + %12 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) + tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32, target("x86.AMX") %12) br label %17 13: %14 = icmp eq i32 %9, 7 @@ -615,9 +615,9 @@ %4 = icmp sgt i32 %0, 0 br i1 %4, label %5, label %8 5: - %6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr @buf, i64 32) + %6 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr @buf, i64 32) call void @foo() - tail call void 
@llvm.x86.tilestored64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %6) + tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, target("x86.AMX") %6) call void @foo() %7 = add i32 %3, 1 br label %2 @@ -625,6 +625,6 @@ ret void } -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) Index: llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll +++ llvm/test/CodeGen/X86/AMX/amx-combine-undef.ll @@ -4,17 +4,17 @@ define void @undef_2phi(ptr%buf) { ; CHECK-LABEL: @undef_2phi( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) +; CHECK-NEXT: [[TMP0:%.*]] = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) ; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) +; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) ; CHECK-NEXT: br i1 undef, label [[L2]], label [[L3:%.*]] ; CHECK: l2: -; CHECK-NEXT: [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi target("x86.AMX") [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ] ; CHECK-NEXT: br i1 undef, label [[L3]], label [[EXIT:%.*]] ; CHECK: l3: -; CHECK-NEXT: [[TMP2:%.*]] = phi x86_amx [ [[TMP1]], [[L2]] ], [ [[T1]], [[L1]] ] -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP2]]) +; CHECK-NEXT: [[TMP2:%.*]] = phi target("x86.AMX") [ [[TMP1]], [[L2]] ], [ [[T1]], [[L1]] ] +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, target("x86.AMX") [[TMP2]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -23,8 +23,8 @@ br i1 undef, label %l1, label %l2 l1: - %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) - %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + %t1 = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t1) br i1 undef, label %l2, label %l3 l2: @@ -33,8 +33,8 @@ l3: %t4 = phi <256 x i32> [ %t3, %l2], [ %t2, %l1 ] - %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4) - call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %buf, i64 1024, x86_amx %t5) + %t5 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4) + call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %buf, i64 1024, target("x86.AMX") %t5) br label %exit exit: @@ -44,14 +44,14 @@ define void @foo_undef(ptr%buf) { ; CHECK-LABEL: @foo_undef( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) +; CHECK-NEXT: [[TMP0:%.*]] = call 
target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) ; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) +; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) ; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: -; CHECK-NEXT: [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ] -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = phi target("x86.AMX") [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ] +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, target("x86.AMX") [[TMP1]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -60,14 +60,14 @@ br i1 undef, label %l1, label %l2 l1: - %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) - %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + %t1 = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t1) br i1 undef, label %l2, label %exit l2: %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] - %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) - call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %buf, i64 1024, x86_amx %t4) + %t4 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) + call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %buf, i64 1024, target("x86.AMX") %t4) br label %exit exit: @@ -77,14 +77,14 @@ define void @foo_zero(ptr%buf) { ; CHECK-LABEL: @foo_zero( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) +; CHECK-NEXT: [[TMP0:%.*]] = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) ; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) +; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) ; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: -; CHECK-NEXT: [[TMP1:%.*]] = phi x86_amx [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ] -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = phi target("x86.AMX") [ [[TMP0]], [[ENTRY:%.*]] ], [ [[T1]], [[L1]] ] +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[BUF:%.*]], i64 1024, target("x86.AMX") [[TMP1]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -93,14 +93,14 @@ br i1 undef, label %l1, label %l2 l1: - %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) - %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + %t1 = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t1) br i1 undef, label %l2, label %exit l2: %t3 = phi <256 x i32> [ zeroinitializer, %entry ], [ %t2, %l1 ] - %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) - call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %buf, i64 1024, x86_amx %t4) + %t4 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) + call 
void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %buf, i64 1024, target("x86.AMX") %t4) br label %exit exit: @@ -114,15 +114,15 @@ ; CHECK-NEXT: [[TMP1:%.*]] = alloca <256 x i32>, align 64 ; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW:%.*]], i16 32) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 32, ptr [[TMP1]], i64 32, x86_amx [[T1]]) +; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 [[ROW:%.*]], i16 32) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 32, ptr [[TMP1]], i64 32, target("x86.AMX") [[T1]]) ; CHECK-NEXT: [[TMP3:%.*]] = load <256 x i32>, ptr [[TMP1]], align 1024 ; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: ; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP3]], [[L1]] ] ; CHECK-NEXT: store <256 x i32> [[T3]], ptr [[TMP0]], align 1024 -; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 32, ptr [[TMP0]], i64 32) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 32, ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP5]]) +; CHECK-NEXT: [[TMP5:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 32, ptr [[TMP0]], i64 32) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 32, ptr [[BUF:%.*]], i64 1024, target("x86.AMX") [[TMP5]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -131,14 +131,14 @@ br i1 undef, label %l1, label %l2 l1: - %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 32) - %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + %t1 = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 %row, i16 32) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t1) br i1 undef, label %l2, label %exit l2: %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] - %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) - call void @llvm.x86.tilestored64.internal(i16 %row, i16 32, ptr %buf, i64 1024, x86_amx %t4) + %t4 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) + call void @llvm.x86.tilestored64.internal(i16 %row, i16 32, ptr %buf, i64 1024, target("x86.AMX") %t4) br label %exit exit: @@ -152,17 +152,17 @@ ; CHECK-NEXT: [[TMP1:%.*]] = alloca <256 x i32>, align 64 ; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 [[COL:%.*]]) +; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 [[COL:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[COL]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 [[COL]], ptr [[TMP1]], i64 [[TMP3]], x86_amx [[T1]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 [[COL]], ptr [[TMP1]], i64 [[TMP3]], target("x86.AMX") [[T1]]) ; CHECK-NEXT: [[TMP4:%.*]] = load <256 x i32>, ptr [[TMP1]], align 1024 ; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: ; CHECK-NEXT: [[T3:%.*]] = phi <256 x i32> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP4]], [[L1]] ] ; CHECK-NEXT: store <256 x i32> [[T3]], ptr [[TMP0]], align 1024 ; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[COL]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 [[COL]], ptr [[TMP0]], i64 
[[TMP6]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 [[COL]], ptr [[BUF:%.*]], i64 1024, x86_amx [[TMP7]]) +; CHECK-NEXT: [[TMP7:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 [[COL]], ptr [[TMP0]], i64 [[TMP6]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 [[COL]], ptr [[BUF:%.*]], i64 1024, target("x86.AMX") [[TMP7]]) ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -171,14 +171,14 @@ br i1 undef, label %l1, label %l2 l1: - %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 %col) - %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + %t1 = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 %col) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t1) br i1 undef, label %l2, label %exit l2: %t3 = phi <256 x i32> [ zeroinitializer, %entry ], [ %t2, %l1 ] - %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) - call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, ptr %buf, i64 1024, x86_amx %t4) + %t4 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) + call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, ptr %buf, i64 1024, target("x86.AMX") %t4) br label %exit exit: @@ -191,8 +191,8 @@ ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 ; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]]) +; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, target("x86.AMX") [[T1]]) ; CHECK-NEXT: [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024 ; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: @@ -206,14 +206,14 @@ br i1 undef, label %l1, label %l2 l1: - %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) - %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + %t1 = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t1) br i1 undef, label %l2, label %exit l2: %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] - %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) - %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t4) + %t4 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) + %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t4) store <256 x i32> %t5, ptr %buf br label %exit @@ -227,8 +227,8 @@ ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 ; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]]) +; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, target("x86.AMX") [[T1]]) ; CHECK-NEXT: [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024 ; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: @@ 
-243,14 +243,14 @@
br i1 undef, label %l1, label %l2
l1:
- %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32)
- %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
+ %t1 = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32)
+ %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t1)
br i1 undef, label %l2, label %exit
l2:
%t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ]
- %t4 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3)
- %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t4)
+ %t4 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3)
+ %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t4)
%t6 = call <256 x i32> @llvm.abs.v256i32(<256 x i32> %t5, i1 1)
store <256 x i32> %t6, ptr %buf
br label %exit
@@ -260,9 +260,9 @@
}
declare <256 x i32> @llvm.abs.v256i32(<256 x i32>, i1)
-declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
-declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx)
-declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
-declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
-declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare target("x86.AMX") @llvm.x86.tilezero.internal(i16, i16)
+declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
+declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX"))
+declare target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
+declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX"))
+declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX"))
Index: llvm/test/CodeGen/X86/AMX/amx-combine.ll
===================================================================
--- llvm/test/CodeGen/X86/AMX/amx-combine.ll
+++ llvm/test/CodeGen/X86/AMX/amx-combine.ll
@@ -3,12 +3,12 @@
define void @combine_store(ptr%p) {
; CHECK-LABEL: @combine_store(
-; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64, x86_amx [[T1]])
+; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 16, i16 64)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64, target("x86.AMX") [[T1]])
; CHECK-NEXT: ret void
;
- %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
- %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
+ %t1 = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 16, i16 64)
+ %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t1)
store <256 x i32> %t2, ptr %p, align 64
ret void
}
@@ -16,41 +16,41 @@
define <256 x i32> @combine_store_2user(ptr%p) {
; CHECK-LABEL: @combine_store_2user(
; CHECK-NEXT: [[TMP1:%.*]] = alloca <256 x i32>, align 64
-; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[TMP1]], i64 64, x86_amx [[T1]])
+; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 16, i16 64)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[TMP1]], i64 64, target("x86.AMX") [[T1]])
;
CHECK-NEXT: [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP1]], align 1024
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64, x86_amx [[T1]])
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64, target("x86.AMX") [[T1]])
; CHECK-NEXT: ret <256 x i32> [[TMP2]]
;
- %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
- %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1)
+ %t1 = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 16, i16 64)
+ %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t1)
store <256 x i32> %t2, ptr %p, align 64
ret <256 x i32> %t2
}
define void @combine_load(ptr%p, ptr%p2) {
; CHECK-LABEL: @combine_load(
-; CHECK-NEXT: [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, x86_amx [[TMP1]])
+; CHECK-NEXT: [[TMP1:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, target("x86.AMX") [[TMP1]])
; CHECK-NEXT: ret void
;
%t1 = load <256 x i32>, ptr %p, align 64
- %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
- call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, x86_amx %t2)
+ %t2 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
+ call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, target("x86.AMX") %t2)
ret void
}
define void @combine_cast_across_store(ptr%p, ptr%p2) {
; CHECK-LABEL: @combine_cast_across_store(
-; CHECK-NEXT: [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64)
+; CHECK-NEXT: [[TMP1:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr [[P:%.*]], i64 64)
; CHECK-NEXT: store <256 x i32> zeroinitializer, ptr [[P]], align 64
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, x86_amx [[TMP1]])
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, target("x86.AMX") [[TMP1]])
; CHECK-NEXT: ret void
;
%t1 = load <256 x i32>, ptr %p, align 64
store <256 x i32> zeroinitializer, ptr %p, align 64
- %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
- call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, x86_amx %t2)
+ %t2 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
+ call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, target("x86.AMX") %t2)
ret void
}
@@ -59,14 +59,14 @@
; CHECK-NEXT: [[TMP1:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT: [[T1:%.*]] = load <256 x i32>, ptr [[P:%.*]], align 64
; CHECK-NEXT: store <256 x i32> [[T1]], ptr [[TMP1]], align 1024
-; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr [[TMP1]], i64 64)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, x86_amx [[TMP2]])
+; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr [[TMP1]], i64 64)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, target("x86.AMX") [[TMP2]])
; CHECK-NEXT: ret <256 x i32> [[T1]]
;
%t1 = load <256 x
i32>, ptr %p, align 64
- %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
- call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, x86_amx %t2)
- %t3 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t2)
+ %t2 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
+ call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, target("x86.AMX") %t2)
+ %t3 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t2)
ret <256 x i32> %t3
}
@@ -75,16 +75,16 @@
; CHECK-NEXT: [[TMP1:%.*]] = alloca <256 x i32>, align 64
; CHECK-NEXT: [[T1:%.*]] = load <256 x i32>, ptr [[P:%.*]], align 64
; CHECK-NEXT: store <256 x i32> [[T1]], ptr [[TMP1]], align 1024
-; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 16, ptr [[TMP1]], i64 16)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, x86_amx [[TMP2]])
-; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 16, i16 64, x86_amx [[TMP2]], x86_amx [[TMP2]], x86_amx [[TMP2]])
+; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 16, ptr [[TMP1]], i64 16)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr [[P2:%.*]], i64 64, target("x86.AMX") [[TMP2]])
+; CHECK-NEXT: [[TMP3:%.*]] = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 16, i16 16, i16 64, target("x86.AMX") [[TMP2]], target("x86.AMX") [[TMP2]], target("x86.AMX") [[TMP2]])
; CHECK-NEXT: ret <256 x i32> [[T1]]
;
%t1 = load <256 x i32>, ptr %p, align 64
- %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
- call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, x86_amx %t2)
- %t3 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t2)
- call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 16, i16 64, x86_amx %t2, x86_amx %t2, x86_amx %t2)
+ %t2 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1)
+ call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %p2, i64 64, target("x86.AMX") %t2)
+ %t3 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t2)
+ call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 16, i16 16, i16 64, target("x86.AMX") %t2, target("x86.AMX") %t2, target("x86.AMX") %t2)
ret <256 x i32> %t3
}
@@ -104,11 +104,11 @@
; CHECK-NEXT: [[A_COL:%.*]] = load i16, ptr [[A_COL_PTR]], align 2
; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[A_COL]], 4
; CHECK-NEXT: [[A_TILE_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64
-; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[A_ROW]], i16 [[A_COL]], ptr [[A_TILE_PTR]], i64 64)
+; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[A_ROW]], i16 [[A_COL]], ptr [[A_TILE_PTR]], i64 64)
; CHECK-NEXT: [[C_TILE_PTR:%.*]] = getelementptr inbounds [[STRUCT___TILE1024I_STR:%.*]], ptr [[C:%.*]], i64 0, i32 3
-; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[A_ROW]], i16 [[B_ROW]], ptr [[C_TILE_PTR]], i64 64)
-; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[B_ROW]], ptr [[TMP0]], i64 64)
-; CHECK-NEXT: [[RES:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[A_ROW]], i16 [[B_ROW]], i16 [[A_COL]], x86_amx [[TMP3]], x86_amx [[TMP2]], x86_amx [[TMP4]])
+; CHECK-NEXT: [[TMP3:%.*]] = call target("x86.AMX")
@llvm.x86.tileloadd64.internal(i16 [[A_ROW]], i16 [[B_ROW]], ptr [[C_TILE_PTR]], i64 64)
+; CHECK-NEXT: [[TMP4:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[B_ROW]], ptr [[TMP0]], i64 64)
+; CHECK-NEXT: [[RES:%.*]] = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 [[A_ROW]], i16 [[B_ROW]], i16 [[A_COL]], target("x86.AMX") [[TMP3]], target("x86.AMX") [[TMP2]], target("x86.AMX") [[TMP4]])
; CHECK-NEXT: ret void
;
entry:
@@ -123,16 +123,16 @@
%a.tile = load <256 x i32>, ptr %a.tile.ptr, align 64
%c.tile.ptr = getelementptr inbounds %struct.__tile1024i_str, ptr %c, i64 0, i32 3
%c.tile = load <256 x i32>, ptr %c.tile.ptr, align 64
- %c.amx = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %c.tile)
- %a.amx = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %a.tile)
- %b.amx = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %b.tile)
- %res = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %a.row, i16 %b.row, i16 %a.col, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx)
+ %c.amx = tail call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %c.tile)
+ %a.amx = tail call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %a.tile)
+ %b.amx = tail call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %b.tile)
+ %res = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %a.row, i16 %b.row, i16 %a.col, target("x86.AMX") %c.amx, target("x86.AMX") %a.amx, target("x86.AMX") %b.amx)
ret void
}
-declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
-declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx)
-declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
-declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
-declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>)
+declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX"))
+declare target("x86.AMX") @llvm.x86.tilezero.internal(i16, i16)
+declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
+declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX"))
+declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX"))
Index: llvm/test/CodeGen/X86/AMX/amx-config.ll
===================================================================
--- llvm/test/CodeGen/X86/AMX/amx-config.ll
+++ llvm/test/CodeGen/X86/AMX/amx-config.ll
@@ -151,26 +151,26 @@
br i1 %4, label %11, label %7
7: ; preds = %3
- %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, ptr @buf, i64 32)
- %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf, i64 32)
- %10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf, i64 32)
+ %8 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, ptr @buf, i64 32)
+ %9 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf, i64 32)
+ %10 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf, i64 32)
br label %15
11: ; preds = %3
- %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, ptr @buf2, i64 32)
- %13 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf2, i64 32)
- %14 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf2, i64 32)
+ %12 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %6, i16 %1, ptr @buf2, i64 32)
+ %13 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf2, i64 32)
+ %14 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %6, i16 %2, ptr @buf2, i64 32)
br label %15
15: ; preds = %11, %7
- %16 = phi x86_amx [ %12, %11 ], [ %8, %7 ]
- %17 = phi x86_amx [ %13, %11 ], [ %9, %7 ]
- %18 = phi x86_amx [ %14, %11 ], [ %10, %7 ]
- %19 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %6, i16 %2, i16 %1, x86_amx %18, x86_amx %16, x86_amx %17)
- tail call void @llvm.x86.tilestored64.internal(i16 %6, i16 %2, ptr @buf, i64 32, x86_amx %19)
+ %16 = phi target("x86.AMX") [ %12, %11 ], [ %8, %7 ]
+ %17 = phi target("x86.AMX") [ %13, %11 ], [ %9, %7 ]
+ %18 = phi target("x86.AMX") [ %14, %11 ], [ %10, %7 ]
+ %19 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %6, i16 %2, i16 %1, target("x86.AMX") %18, target("x86.AMX") %16, target("x86.AMX") %17)
+ tail call void @llvm.x86.tilestored64.internal(i16 %6, i16 %2, ptr @buf, i64 32, target("x86.AMX") %19)
ret <4 x i32> %xmm0
}
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
-declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
+declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
+declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX"))
+declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX"))
Index: llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
===================================================================
--- llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
+++ llvm/test/CodeGen/X86/AMX/amx-configO0toO0.ll
@@ -769,8 +769,8 @@
%12 = load i16, ptr %n.addr.i, align 2
%13 = load ptr, ptr %base.addr.i56, align 8
%14 = load i64, ptr %stride.addr.i57, align 8
- %15 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %11, i16 %12, ptr %13, i64 %14) #2
- %16 = bitcast x86_amx %15 to <256 x i32>
+ %15 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %11, i16 %12, ptr %13, i64 %14) #2
+ %16 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %15)
%17 = load ptr, ptr %dst.addr.i35, align 8
%tile.i41 = getelementptr inbounds %struct.__tile1024i_str, ptr %17, i32 0, i32 3
store <256 x i32> %16, ptr %tile.i41, align 64
@@ -792,8 +792,8 @@
%25 = load i16, ptr %n.addr.i59, align 2
%26 = load ptr, ptr %base.addr.i60, align 8
%27 = load i64, ptr %stride.addr.i61, align 8
- %28 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %24, i16 %25, ptr %26, i64 %27) #2
- %29 = bitcast x86_amx %28 to <256 x i32>
+ %28 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %24, i16 %25, ptr %26, i64 %27) #2
+ %29 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %28)
%30 = load ptr, ptr %dst.addr.i28, align 8
%tile.i34 = getelementptr inbounds %struct.__tile1024i_str, ptr %30, i32 0, i32 3
store <256 x i32> %29, ptr %tile.i34, align 64
@@ -815,8 +815,8 @@
%38 = load i16, ptr %n.addr.i63, align 2
%39 = load ptr, ptr %base.addr.i64, align 8
%40 = load i64, ptr %stride.addr.i65, align 8
- %41 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %37, i16 %38, ptr %39, i64 %40) #2
- %42 = bitcast x86_amx %41 to <256 x i32>
+ %41 = call
target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %37, i16 %38, ptr %39, i64 %40) #2
+ %42 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %41)
%43 = load ptr, ptr %dst.addr.i21, align 8
%tile.i27 = getelementptr inbounds %struct.__tile1024i_str, ptr %43, i32 0, i32 3
store <256 x i32> %42, ptr %tile.i27, align 64
@@ -841,8 +841,8 @@
%51 = load i16, ptr %n.addr.i67, align 2
%52 = load ptr, ptr %base.addr.i68, align 8
%53 = load i64, ptr %stride.addr.i69, align 8
- %54 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %50, i16 %51, ptr %52, i64 %53) #2
- %55 = bitcast x86_amx %54 to <256 x i32>
+ %54 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %50, i16 %51, ptr %52, i64 %53) #2
+ %55 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %54)
%56 = load ptr, ptr %dst.addr.i14, align 8
%tile.i20 = getelementptr inbounds %struct.__tile1024i_str, ptr %56, i32 0, i32 3
store <256 x i32> %55, ptr %tile.i20, align 64
@@ -864,8 +864,8 @@
%64 = load i16, ptr %n.addr.i71, align 2
%65 = load ptr, ptr %base.addr.i72, align 8
%66 = load i64, ptr %stride.addr.i73, align 8
- %67 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %63, i16 %64, ptr %65, i64 %66) #2
- %68 = bitcast x86_amx %67 to <256 x i32>
+ %67 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %63, i16 %64, ptr %65, i64 %66) #2
+ %68 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %67)
%69 = load ptr, ptr %dst.addr.i7, align 8
%tile.i13 = getelementptr inbounds %struct.__tile1024i_str, ptr %69, i32 0, i32 3
store <256 x i32> %68, ptr %tile.i13, align 64
@@ -887,8 +887,8 @@
%77 = load i16, ptr %n.addr.i75, align 2
%78 = load ptr, ptr %base.addr.i76, align 8
%79 = load i64, ptr %stride.addr.i77, align 8
- %80 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %76, i16 %77, ptr %78, i64 %79) #2
- %81 = bitcast x86_amx %80 to <256 x i32>
+ %80 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %76, i16 %77, ptr %78, i64 %79) #2
+ %81 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %80)
%82 = load ptr, ptr %dst.addr.i, align 8
%tile.i = getelementptr inbounds %struct.__tile1024i_str, ptr %82, i32 0, i32 3
store <256 x i32> %81, ptr %tile.i, align 64
@@ -929,13 +929,13 @@
%91 = load i16, ptr %n.addr.i82, align 2
%92 = load i16, ptr %k.addr.i, align 2
%93 = load <256 x i32>, ptr %dst.addr.i83, align 64
- %94 = bitcast <256 x i32> %93 to x86_amx
+ %94 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %93)
%95 = load <256 x i32>, ptr %src1.addr.i, align 64
- %96 = bitcast <256 x i32> %95 to x86_amx
+ %96 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %95)
%97 = load <256 x i32>, ptr %src2.addr.i, align 64
- %98 = bitcast <256 x i32> %97 to x86_amx
- %99 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %90, i16 %91, i16 %92, x86_amx %94, x86_amx %96, x86_amx %98) #2
- %100 = bitcast x86_amx %99 to <256 x i32>
+ %98 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %97)
+ %99 = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %90, i16 %91, i16 %92, target("x86.AMX") %94, target("x86.AMX") %96, target("x86.AMX") %98) #2
+ %100 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %99)
%101 = load ptr, ptr %dst.addr.i44, align 8
%tile6.i = getelementptr inbounds %struct.__tile1024i_str, ptr %101, i32 0, i32 3
store <256 x i32> %100, ptr %tile6.i, align 64
@@ -962,16 +962,18 @@
%109 = load ptr, ptr %base.addr.i87, align 8
%110 = load
i64, ptr %stride.addr.i88, align 8
%111 = load <256 x i32>, ptr %tile.addr.i, align 64
- %112 = bitcast <256 x i32> %111 to x86_amx
- call void @llvm.x86.tilestored64.internal(i16 %107, i16 %108, ptr %109, i64 %110, x86_amx %112) #2
+ %112 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %111)
+ call void @llvm.x86.tilestored64.internal(i16 %107, i16 %108, ptr %109, i64 %110, target("x86.AMX") %112) #2
ret void
}
declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #1
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) #2
-declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #2
-declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #2
+declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) #2
+declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) #2
+declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) #2
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #3
+declare <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX"))
+declare target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32>)
attributes #0 = { noinline nounwind optnone }
attributes #1 = { argmemonly nofree nosync nounwind willreturn writeonly }
Index: llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll
===================================================================
--- llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll
+++ llvm/test/CodeGen/X86/AMX/amx-configO2toO0-lower.ll
@@ -14,29 +14,29 @@
; CHECK-NEXT: %tobool.not = icmp eq i32 %cond, 0
; CHECK-NEXT: br i1 %tobool.not, label %if.else, label %if.then
; CHECK: if.then:
-; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf, i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, ptr %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf, i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, ptr %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf, i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf, i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, ptr %{{[0-9]+}}, i64 64, target("x86.AMX") %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf, i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, ptr %{{[0-9]+}}, i64 64, target("x86.AMX") %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf, i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %{{[0-9]+}}, i64 64, target("x86.AMX") %{{[0-9]+}})
; CHECK-NEXT: br label %if.end
; CHECK: if.else:
-; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf2, i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, ptr
%{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf2, i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, ptr %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf2, i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf2, i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, ptr %{{[0-9]+}}, i64 64, target("x86.AMX") %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf2, i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, ptr %{{[0-9]+}}, i64 64, target("x86.AMX") %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf2, i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %{{[0-9]+}}, i64 64, target("x86.AMX") %{{[0-9]+}})
; CHECK-NEXT: br label %if.end
; CHECK: if.end:
-; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr %{{[0-9]+}}, i64 64)
-; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr %{{[0-9]+}}, i64 64)
-; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr %{{[0-9]+}}, i64 64)
-; CHECK-NEXT: %{{[0-9]+}} = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}}, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %{{[0-9]+}}, i64 64, x86_amx %{{[0-9]+}})
-; CHECK-NEXT: %{{[0-9]+}} = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr %{{[0-9]+}}, i64 64)
-; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr @buf, i64 32, x86_amx %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr %{{[0-9]+}}, i64 64)
+; CHECK-NEXT: %{{[0-9]+}} = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr %{{[0-9]+}}, i64 64)
+; CHECK-NEXT: %{{[0-9]+}} = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr %{{[0-9]+}}, i64 64)
+; CHECK-NEXT: %{{[0-9]+}} = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, target("x86.AMX") %{{[0-9]+}}, target("x86.AMX") %{{[0-9]+}}, target("x86.AMX") %{{[0-9]+}})
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %{{[0-9]+}}, i64 64, target("x86.AMX") %{{[0-9]+}})
+; CHECK-NEXT: %{{[0-9]+}} = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr %{{[0-9]+}}, i64 64)
+; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr @buf, i64 32, target("x86.AMX") %{{[0-9]+}})
; CHECK-NEXT: ret void
entry:
@@ -44,31 +44,31 @@
br i1 %tobool.not, label %if.else, label %if.then
if.then: ; preds = %entry
- %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf, i64 32)
- %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf, i64 32)
- %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row,
i16 %col, ptr @buf, i64 32)
+ %0 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf, i64 32)
+ %1 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf, i64 32)
+ %2 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf, i64 32)
br label %if.end
if.else: ; preds = %entry
- %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf2, i64 32)
- %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf2, i64 32)
- %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf2, i64 32)
+ %3 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf2, i64 32)
+ %4 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf2, i64 32)
+ %5 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf2, i64 32)
br label %if.end
if.end: ; preds = %if.else, %if.then
- %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ]
- %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ]
- %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ]
- %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in)
- tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr @buf, i64 32, x86_amx %6)
+ %a.sroa.1094.0.in = phi target("x86.AMX") [ %3, %if.else ], [ %0, %if.then ]
+ %b.sroa.1069.0.in = phi target("x86.AMX") [ %4, %if.else ], [ %1, %if.then ]
+ %c.sroa.1044.0.in = phi target("x86.AMX") [ %5, %if.else ], [ %2, %if.then ]
+ %6 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, target("x86.AMX") %c.sroa.1044.0.in, target("x86.AMX") %a.sroa.1094.0.in, target("x86.AMX") %b.sroa.1069.0.in)
+ tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr @buf, i64 32, target("x86.AMX") %6)
ret void
}
; Function Attrs: nounwind
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
+declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
; Function Attrs: nounwind
-declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX"))
; Function Attrs: nounwind
-declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
+declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX"))
Index: llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll
===================================================================
--- llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll
+++ llvm/test/CodeGen/X86/AMX/amx-configO2toO0-precfg.ll
@@ -31,8 +31,8 @@
; CHECK-NEXT: store i8 [[TMP10]], ptr [[AMX_TMM_0_SHAPE_ROW]], align 1
; CHECK-NEXT: store i16 8, ptr [[TMP9]], align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(ptr [[TMP7]])
-; CHECK-NEXT: [[I8:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 8, ptr @buf, i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 8, ptr [[I4]], i64 64, x86_amx [[I8]])
+; CHECK-NEXT: [[I8:%.*]] = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 8, ptr @buf, i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 8, ptr [[I4]], i64 64,
target("x86.AMX") [[I8]])
; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[TMP6]], align 4
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP6]], i64 0
; CHECK-NEXT: store i8 1, ptr [[TMP11]], align 1
@@ -41,8 +41,8 @@
; CHECK-NEXT: store i8 8, ptr [[AMX_TMM_0_SHAPE_ROW1]], align 1
; CHECK-NEXT: store i16 [[COL:%.*]], ptr [[TMP12]], align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(ptr [[TMP6]])
-; CHECK-NEXT: [[I9:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 [[COL]], ptr @buf, i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 [[COL]], ptr [[I2]], i64 64, x86_amx [[I9]])
+; CHECK-NEXT: [[I9:%.*]] = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 [[COL]], ptr @buf, i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 [[COL]], ptr [[I2]], i64 64, target("x86.AMX") [[I9]])
; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP5]], i64 0
; CHECK-NEXT: store i8 1, ptr [[TMP13]], align 1
@@ -52,8 +52,8 @@
; CHECK-NEXT: store i8 [[TMP15]], ptr [[AMX_TMM_0_SHAPE_ROW2]], align 1
; CHECK-NEXT: store i16 [[COL]], ptr [[TMP14]], align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(ptr [[TMP5]])
-; CHECK-NEXT: [[I10:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL]], ptr @buf, i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL]], ptr [[I]], i64 64, x86_amx [[I10]])
+; CHECK-NEXT: [[I10:%.*]] = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL]], ptr @buf, i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL]], ptr [[I]], i64 64, target("x86.AMX") [[I10]])
; CHECK-NEXT: br label [[IF_END:%.*]]
; CHECK: if.else:
; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[TMP4]], align 4
@@ -65,8 +65,8 @@
; CHECK-NEXT: store i8 [[TMP18]], ptr [[AMX_TMM_0_SHAPE_ROW3]], align 1
; CHECK-NEXT: store i16 8, ptr [[TMP17]], align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(ptr [[TMP4]])
-; CHECK-NEXT: [[I11:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 8, ptr @buf2, i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 8, ptr [[I4]], i64 64, x86_amx [[I11]])
+; CHECK-NEXT: [[I11:%.*]] = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 8, ptr @buf2, i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 8, ptr [[I4]], i64 64, target("x86.AMX") [[I11]])
; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[TMP3]], align 4
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP3]], i64 0
; CHECK-NEXT: store i8 1, ptr [[TMP19]], align 1
@@ -75,8 +75,8 @@
; CHECK-NEXT: store i8 8, ptr [[AMX_TMM_0_SHAPE_ROW4]], align 1
; CHECK-NEXT: store i16 [[COL]], ptr [[TMP20]], align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(ptr [[TMP3]])
-; CHECK-NEXT: [[I12:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 [[COL]], ptr @buf2, i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 [[COL]], ptr [[I2]], i64 64, x86_amx [[I12]])
+; CHECK-NEXT: [[I12:%.*]] = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 [[COL]], ptr @buf2, i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 [[COL]], ptr [[I2]], i64 64, target("x86.AMX") [[I12]])
; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[TMP2]],
align 4
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP2]], i64 0
; CHECK-NEXT: store i8 1, ptr [[TMP21]], align 1
@@ -86,8 +86,8 @@
; CHECK-NEXT: store i8 [[TMP23]], ptr [[AMX_TMM_0_SHAPE_ROW5]], align 1
; CHECK-NEXT: store i16 [[COL]], ptr [[TMP22]], align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(ptr [[TMP2]])
-; CHECK-NEXT: [[I13:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL]], ptr @buf2, i64 32)
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL]], ptr [[I]], i64 64, x86_amx [[I13]])
+; CHECK-NEXT: [[I13:%.*]] = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL]], ptr @buf2, i64 32)
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL]], ptr [[I]], i64 64, target("x86.AMX") [[I13]])
; CHECK-NEXT: br label [[IF_END]]
; CHECK: if.end:
; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[TMP1]], align 4
@@ -113,11 +113,11 @@
; CHECK-NEXT: store i8 [[TMP31]], ptr [[AMX_TMM_3_SHAPE_ROW]], align 1
; CHECK-NEXT: store i16 [[COL]], ptr [[TMP30]], align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(ptr [[TMP1]])
-; CHECK-NEXT: [[I14:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 8, ptr [[I4]], i64 64)
-; CHECK-NEXT: [[I15:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 [[COL]], ptr [[I2]], i64 64)
-; CHECK-NEXT: [[I16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL]], ptr [[I]], i64 64)
-; CHECK-NEXT: [[I17:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL]], i16 8, x86_amx [[I16]], x86_amx [[I14]], x86_amx [[I15]])
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL]], ptr [[I6]], i64 64, x86_amx [[I17]])
+; CHECK-NEXT: [[I14:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 8, ptr [[I4]], i64 64)
+; CHECK-NEXT: [[I15:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 [[COL]], ptr [[I2]], i64 64)
+; CHECK-NEXT: [[I16:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL]], ptr [[I]], i64 64)
+; CHECK-NEXT: [[I17:%.*]] = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL]], i16 8, target("x86.AMX") [[I16]], target("x86.AMX") [[I14]], target("x86.AMX") [[I15]])
+; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL]], ptr [[I6]], i64 64, target("x86.AMX") [[I17]])
; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
; CHECK-NEXT: store i8 1, ptr [[TMP32]], align 1
@@ -127,8 +127,8 @@
; CHECK-NEXT: store i8 [[TMP34]], ptr [[AMX_TMM_0_SHAPE_ROW7]], align 1
; CHECK-NEXT: store i16 [[COL]], ptr [[TMP33]], align 2
; CHECK-NEXT: call void @llvm.x86.ldtilecfg.internal(ptr [[TMP0]])
-; CHECK-NEXT: [[I18:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL]], ptr [[I6]], i64 64)
-; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL]], ptr @buf, i64 32, x86_amx [[I18]])
+; CHECK-NEXT: [[I18:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL]], ptr [[I6]], i64 64)
+; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL]], ptr @buf, i64 32, target("x86.AMX") [[I18]])
; CHECK-NEXT: ret void
;
entry:
@@ -140,39 +140,39 @@
br i1 %tobool.not, label %if.else, label %if.then
if.then: ; preds = %entry
- %i8 = tail
call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf, i64 32)
- call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, ptr %i4, i64 64, x86_amx %i8)
- %i9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf, i64 32)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, ptr %i2, i64 64, x86_amx %i9)
- %i10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf, i64 32)
- call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %i, i64 64, x86_amx %i10)
+ %i8 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf, i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, ptr %i4, i64 64, target("x86.AMX") %i8)
+ %i9 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf, i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, ptr %i2, i64 64, target("x86.AMX") %i9)
+ %i10 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf, i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %i, i64 64, target("x86.AMX") %i10)
br label %if.end
if.else: ; preds = %entry
- %i11 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf2, i64 32)
- call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, ptr %i4, i64 64, x86_amx %i11)
- %i12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf2, i64 32)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, ptr %i2, i64 64, x86_amx %i12)
- %i13 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf2, i64 32)
- call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %i, i64 64, x86_amx %i13)
+ %i11 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf2, i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 8, ptr %i4, i64 64, target("x86.AMX") %i11)
+ %i12 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf2, i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 %col, ptr %i2, i64 64, target("x86.AMX") %i12)
+ %i13 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf2, i64 32)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %i, i64 64, target("x86.AMX") %i13)
br label %if.end
if.end: ; preds = %if.else, %if.then
- %i14 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr %i4, i64 64)
- %i15 = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr %i2, i64 64)
- %i16 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr %i, i64 64)
- %i17 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %i16, x86_amx %i14, x86_amx %i15)
- call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %i6, i64 64, x86_amx %i17)
- %i18 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr %i6, i64 64)
- tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr @buf, i64 32, x86_amx %i18)
+ %i14 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr %i4, i64 64)
+ %i15 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr %i2, i64 64)
+ %i16 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr %i, i64 64)
+ %i17 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, target("x86.AMX") %i16, target("x86.AMX") %i14,
target("x86.AMX") %i15)
+ call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %i6, i64 64, target("x86.AMX") %i17)
+ %i18 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr %i6, i64 64)
+ tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr @buf, i64 32, target("x86.AMX") %i18)
ret void
}
; Function Attrs: nounwind
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
+declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
; Function Attrs: nounwind
-declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX"))
; Function Attrs: nounwind
-declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
+declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX"))
Index: llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
===================================================================
--- llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
+++ llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
@@ -152,26 +152,26 @@
br i1 %tobool.not, label %if.else, label %if.then
if.then: ; preds = %entry
- %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf, i64 32)
- %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf, i64 32)
- %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf, i64 32)
+ %0 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf, i64 32)
+ %1 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf, i64 32)
+ %2 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf, i64 32)
br label %if.end
if.else: ; preds = %entry
- %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf2, i64 32)
- %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf2, i64 32)
- %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf2, i64 32)
+ %3 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf2, i64 32)
+ %4 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf2, i64 32)
+ %5 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf2, i64 32)
br label %if.end
if.end: ; preds = %if.else, %if.then
- %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ]
- %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ]
- %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ]
- %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in)
- tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr @buf, i64 32, x86_amx %6)
+ %a.sroa.1094.0.in = phi target("x86.AMX") [ %3, %if.else ], [ %0, %if.then ]
+ %b.sroa.1069.0.in = phi target("x86.AMX") [ %4, %if.else ], [ %1, %if.then ]
+ %c.sroa.1044.0.in = phi target("x86.AMX") [ %5, %if.else ], [ %2, %if.then ]
+ %6 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, target("x86.AMX") %c.sroa.1044.0.in, target("x86.AMX") %a.sroa.1094.0.in, target("x86.AMX") %b.sroa.1069.0.in)
+ tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr @buf, i64 32, target("x86.AMX") %6)
ret
void
}
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
-declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
+declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
+declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX"))
+declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX"))
Index: llvm/test/CodeGen/X86/AMX/amx-error.ll
===================================================================
--- llvm/test/CodeGen/X86/AMX/amx-error.ll
+++ llvm/test/CodeGen/X86/AMX/amx-error.ll
@@ -7,10 +7,10 @@
entry:
; CHECK: Failed to config tile register
%t0 = load i16, ptr @row, align 2
- %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 %t0, i16 64)
+ %t1 = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 %t0, i16 64)
%t2 = load i16, ptr @col, align 2
- %t3 = call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 %t2)
+ %t3 = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 16, i16 %t2)
ret void
}
-declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
+declare target("x86.AMX") @llvm.x86.tilezero.internal(i16, i16)
Index: llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir
===================================================================
--- llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir
+++ llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir
@@ -14,29 +14,29 @@
br i1 %tobool.not, label %if.else, label %if.then
if.then: ; preds = %entry
- %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf, i64 32)
- %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf, i64 32)
- %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf, i64 32)
+ %0 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf, i64 32)
+ %1 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf, i64 32)
+ %2 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf, i64 32)
br label %if.end
if.else: ; preds = %entry
- %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf2, i64 32)
- %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf2, i64 32)
- %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf2, i64 32)
+ %3 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 8, ptr @buf2, i64 32)
+ %4 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %col, ptr @buf2, i64 32)
+ %5 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr @buf2, i64 32)
br label %if.end
if.end: ; preds = %if.else, %if.then
- %a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ]
- %b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ]
- %c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then
]
+ %a.sroa.1094.0.in = phi target("x86.AMX") [ %3, %if.else ], [ %0, %if.then ]
+ %b.sroa.1069.0.in = phi target("x86.AMX") [ %4, %if.else ], [ %1, %if.then ]
+ %c.sroa.1044.0.in = phi target("x86.AMX") [ %5, %if.else ], [ %2, %if.then ]
+ %6 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, target("x86.AMX") %c.sroa.1044.0.in, target("x86.AMX") %a.sroa.1094.0.in, target("x86.AMX") %b.sroa.1069.0.in)
+ tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr @buf, i64 32, target("x86.AMX") %6)
ret void
}
- declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) #1
- declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #1
- declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #1
+ declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) #1
+ declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) #1
+ declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) #1
attributes #0 = { "target-features"="+amx-int8,+avx512f" }
attributes #1 = { nounwind "target-features"="+amx-int8,+avx512f" }
Index: llvm/test/CodeGen/X86/AMX/amx-fp16.ll
===================================================================
--- llvm/test/CodeGen/X86/AMX/amx-fp16.ll
+++ llvm/test/CodeGen/X86/AMX/amx-fp16.ll
@@ -24,18 +24,18 @@
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
- %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride)
- %b = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride)
- %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
- %d = call x86_amx @llvm.x86.tdpfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
- %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, ptr %base, i64 %stride)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %pointer, i64 %stride, x86_amx %d)
+ %a = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride)
+ %b = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride)
+ %c = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 8)
+ %d = call target("x86.AMX") @llvm.x86.tdpfp16ps.internal(i16 8, i16 8, i16 8, target("x86.AMX") %c, target("x86.AMX") %a, target("x86.AMX") %b)
+ %e = call target("x86.AMX") @llvm.x86.tileloaddt164.internal(i16 8, i16 8, ptr %base, i64 %stride)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %pointer, i64 %stride, target("x86.AMX") %d)
ret void
}
-declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
-declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, ptr, i64)
-declare x86_amx @llvm.x86.tdpfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)
+declare target("x86.AMX") @llvm.x86.tilezero.internal(i16, i16)
+declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
+declare target("x86.AMX") @llvm.x86.tileloaddt164.internal(i16, i16, ptr, i64)
+declare target("x86.AMX") @llvm.x86.tdpfp16ps.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX"))
+declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX"))
Index: llvm/test/CodeGen/X86/AMX/amx-gemm.ll
===================================================================
--- llvm/test/CodeGen/X86/AMX/amx-gemm.ll
+++ llvm/test/CodeGen/X86/AMX/amx-gemm.ll
@@ -73,7 +73,7 @@
for.body6: ; preds = %for.cond.cleanup9, %for.cond3.preheader
%indvars.iv199 = phi i64 [
%indvars.iv.next200, %for.cond.cleanup9 ], [ 0, %for.cond3.preheader ]
- %i3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
+ %i3 = tail call target("x86.AMX") @llvm.x86.tilezero.internal(i16 16, i16 64)
%i4 = shl nsw i64 %indvars.iv199, 4
br i1 %cmp8163, label %for.body10.preheader, label %for.cond.cleanup9
@@ -82,109 +82,109 @@
br i1 %i1, label %for.cond.cleanup9.loopexit.unr-lcssa, label %for.body10
for.cond.cleanup9.loopexit.unr-lcssa: ; preds = %for.body10, %for.body10.preheader
- %.lcssa.ph = phi x86_amx [ undef, %for.body10.preheader ], [ %i68, %for.body10 ]
+ %.lcssa.ph = phi target("x86.AMX") [ undef, %for.body10.preheader ], [ %i68, %for.body10 ]
%indvars.iv.unr = phi i64 [ 0, %for.body10.preheader ], [ %indvars.iv.next.7, %for.body10 ]
- %c.sroa.8127.2.in164.unr = phi x86_amx [ %i3, %for.body10.preheader ], [ %i68, %for.body10 ]
+ %c.sroa.8127.2.in164.unr = phi target("x86.AMX") [ %i3, %for.body10.preheader ], [ %i68, %for.body10 ]
br i1 %lcmp.mod.not, label %for.cond.cleanup9, label %for.body10.epil
for.body10.epil: ; preds = %for.body10.epil, %for.cond.cleanup9.loopexit.unr-lcssa
%indvars.iv.epil = phi i64 [ %indvars.iv.next.epil, %for.body10.epil ], [ %indvars.iv.unr, %for.cond.cleanup9.loopexit.unr-lcssa ]
- %c.sroa.8127.2.in164.epil = phi x86_amx [ %i11, %for.body10.epil ], [ %c.sroa.8127.2.in164.unr, %for.cond.cleanup9.loopexit.unr-lcssa ]
+ %c.sroa.8127.2.in164.epil = phi target("x86.AMX") [ %i11, %for.body10.epil ], [ %c.sroa.8127.2.in164.unr, %for.cond.cleanup9.loopexit.unr-lcssa ]
%epil.iter = phi i64 [ %epil.iter.sub, %for.body10.epil ], [ %xtraiter, %for.cond.cleanup9.loopexit.unr-lcssa ]
%i5 = shl nsw i64 %indvars.iv.epil, 4
%add.ptr14.epil = getelementptr inbounds i32, ptr %add.ptr, i64 %i5
- %i7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %add.ptr14.epil, i64 %mul15)
+ %i7 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %add.ptr14.epil, i64 %mul15)
%i8 = mul nsw i64 %i5, %conv23
%add.ptr22.epil = getelementptr inbounds i32, ptr %add.ptr19, i64 %i8
- %i10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %add.ptr22.epil, i64 %mul24)
- %i11 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, x86_amx %c.sroa.8127.2.in164.epil, x86_amx %i7, x86_amx %i10)
+ %i10 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %add.ptr22.epil, i64 %mul24)
+ %i11 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, target("x86.AMX") %c.sroa.8127.2.in164.epil, target("x86.AMX") %i7, target("x86.AMX") %i10)
%indvars.iv.next.epil = add nuw nsw i64 %indvars.iv.epil, 1
%epil.iter.sub = add i64 %epil.iter, -1
%epil.iter.cmp.not = icmp eq i64 %epil.iter.sub, 0
br i1 %epil.iter.cmp.not, label %for.cond.cleanup9, label %for.body10.epil
for.cond.cleanup9: ; preds = %for.body10.epil, %for.cond.cleanup9.loopexit.unr-lcssa, %for.body6
- %c.sroa.8127.2.in.lcssa = phi x86_amx [ %i3, %for.body6 ], [ %.lcssa.ph, %for.cond.cleanup9.loopexit.unr-lcssa ], [ %i11, %for.body10.epil ]
+ %c.sroa.8127.2.in.lcssa = phi target("x86.AMX") [ %i3, %for.body6 ], [ %.lcssa.ph, %for.cond.cleanup9.loopexit.unr-lcssa ], [ %i11, %for.body10.epil ]
%add.ptr31 = getelementptr inbounds i32, ptr %add.ptr28, i64 %i4
- tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %add.ptr31, i64 %mul24, x86_amx %c.sroa.8127.2.in.lcssa)
+ tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %add.ptr31, i64 %mul24, target("x86.AMX")
%c.sroa.8127.2.in.lcssa)
%indvars.iv.next200 = add nuw nsw i64 %indvars.iv199, 1
%exitcond204.not = icmp eq i64 %indvars.iv.next200, %wide.trip.count203
br i1 %exitcond204.not, label %for.cond.cleanup5, label %for.body6
for.body10: ; preds = %for.body10, %for.body10.preheader
%indvars.iv = phi i64 [ %indvars.iv.next.7, %for.body10 ], [ 0, %for.body10.preheader ]
- %c.sroa.8127.2.in164 = phi x86_amx [ %i68, %for.body10 ], [ %i3, %for.body10.preheader ]
+ %c.sroa.8127.2.in164 = phi target("x86.AMX") [ %i68, %for.body10 ], [ %i3, %for.body10.preheader ]
%niter = phi i64 [ %niter.nsub.7, %for.body10 ], [ %unroll_iter, %for.body10.preheader ]
%i13 = shl nsw i64 %indvars.iv, 4
%add.ptr14 = getelementptr inbounds i32, ptr %add.ptr, i64 %i13
- %i15 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %add.ptr14, i64 %mul15)
+ %i15 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %add.ptr14, i64 %mul15)
%i16 = mul nsw i64 %i13, %conv23
%add.ptr22 = getelementptr inbounds i32, ptr %add.ptr19, i64 %i16
- %i18 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %add.ptr22, i64 %mul24)
- %i19 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, x86_amx %c.sroa.8127.2.in164, x86_amx %i15, x86_amx %i18)
+ %i18 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %add.ptr22, i64 %mul24)
+ %i19 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, target("x86.AMX") %c.sroa.8127.2.in164, target("x86.AMX") %i15, target("x86.AMX") %i18)
%indvars.iv.next = shl i64 %indvars.iv, 4
%i20 = or i64 %indvars.iv.next, 16
%add.ptr14.1 = getelementptr inbounds i32, ptr %add.ptr, i64 %i20
- %i22 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr14.1, i64 %mul15)
+ %i22 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr14.1, i64 %mul15)
%i23 = mul nsw i64 %i20, %conv23
%add.ptr22.1 = getelementptr inbounds i32, ptr %add.ptr19, i64 %i23
- %i25 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr22.1, i64 %mul24)
- %i26 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, x86_amx %i19, x86_amx %i22, x86_amx %i25)
+ %i25 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr22.1, i64 %mul24)
+ %i26 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, target("x86.AMX") %i19, target("x86.AMX") %i22, target("x86.AMX") %i25)
%indvars.iv.next.1 = shl i64 %indvars.iv, 4
%i27 = or i64 %indvars.iv.next.1, 32
%add.ptr14.2 = getelementptr inbounds i32, ptr %add.ptr, i64 %i27
- %i29 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr14.2, i64 %mul15)
+ %i29 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr14.2, i64 %mul15)
%i30 = mul nsw i64 %i27, %conv23
%add.ptr22.2 = getelementptr inbounds i32, ptr %add.ptr19, i64 %i30
- %i32 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr22.2, i64 %mul24)
- %i33 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, x86_amx %i26, x86_amx %i29, x86_amx %i32)
+ %i32 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr22.2, i64 %mul24)
+ %i33 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, target("x86.AMX") %i26, target("x86.AMX") %i29,
target("x86.AMX") %i32) %indvars.iv.next.2 = shl i64 %indvars.iv, 4 %i34 = or i64 %indvars.iv.next.2, 48 %add.ptr14.3 = getelementptr inbounds i32, ptr %add.ptr, i64 %i34 - %i36 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr14.3, i64 %mul15) + %i36 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr14.3, i64 %mul15) %i37 = mul nsw i64 %i34, %conv23 %add.ptr22.3 = getelementptr inbounds i32, ptr %add.ptr19, i64 %i37 - %i39 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr22.3, i64 %mul24) - %i40 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, x86_amx %i33, x86_amx %i36, x86_amx %i39) + %i39 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr22.3, i64 %mul24) + %i40 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, target("x86.AMX") %i33, target("x86.AMX") %i36, target("x86.AMX") %i39) %indvars.iv.next.3 = shl i64 %indvars.iv, 4 %i41 = or i64 %indvars.iv.next.3, 64 %add.ptr14.4 = getelementptr inbounds i32, ptr %add.ptr, i64 %i41 - %i43 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr14.4, i64 %mul15) + %i43 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr14.4, i64 %mul15) %i44 = mul nsw i64 %i41, %conv23 %add.ptr22.4 = getelementptr inbounds i32, ptr %add.ptr19, i64 %i44 - %i46 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr22.4, i64 %mul24) - %i47 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, x86_amx %i40, x86_amx %i43, x86_amx %i46) + %i46 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr22.4, i64 %mul24) + %i47 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, target("x86.AMX") %i40, target("x86.AMX") %i43, target("x86.AMX") %i46) %indvars.iv.next.4 = shl i64 %indvars.iv, 4 %i48 = or i64 %indvars.iv.next.4, 80 %add.ptr14.5 = getelementptr inbounds i32, ptr %add.ptr, i64 %i48 - %i50 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr14.5, i64 %mul15) + %i50 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr14.5, i64 %mul15) %i51 = mul nsw i64 %i48, %conv23 %add.ptr22.5 = getelementptr inbounds i32, ptr %add.ptr19, i64 %i51 - %i53 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr22.5, i64 %mul24) - %i54 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, x86_amx %i47, x86_amx %i50, x86_amx %i53) + %i53 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr22.5, i64 %mul24) + %i54 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, target("x86.AMX") %i47, target("x86.AMX") %i50, target("x86.AMX") %i53) %indvars.iv.next.5 = shl i64 %indvars.iv, 4 %i55 = or i64 %indvars.iv.next.5, 96 %add.ptr14.6 = getelementptr inbounds i32, ptr %add.ptr, i64 %i55 - %i57 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr14.6, i64 %mul15) + %i57 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr14.6, i64 %mul15) %i58 = mul nsw i64 %i55, %conv23 %add.ptr22.6 = getelementptr inbounds i32, ptr %add.ptr19, i64 %i58 - %i60 = tail call x86_amx 
@llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr22.6, i64 %mul24) - %i61 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, x86_amx %i54, x86_amx %i57, x86_amx %i60) + %i60 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr22.6, i64 %mul24) + %i61 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, target("x86.AMX") %i54, target("x86.AMX") %i57, target("x86.AMX") %i60) %indvars.iv.next.6 = shl i64 %indvars.iv, 4 %i62 = or i64 %indvars.iv.next.6, 112 %add.ptr14.7 = getelementptr inbounds i32, ptr %add.ptr, i64 %i62 - %i64 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr14.7, i64 %mul15) + %i64 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr14.7, i64 %mul15) %i65 = mul nsw i64 %i62, %conv23 %add.ptr22.7 = getelementptr inbounds i32, ptr %add.ptr19, i64 %i65 - %i67 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr22.7, i64 %mul24) - %i68 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, x86_amx %i61, x86_amx %i64, x86_amx %i67) + %i67 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %add.ptr22.7, i64 %mul24) + %i68 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, target("x86.AMX") %i61, target("x86.AMX") %i64, target("x86.AMX") %i67) %indvars.iv.next.7 = add nuw nsw i64 %indvars.iv, 8 %niter.nsub.7 = add i64 %niter, -8 %niter.ncmp.7 = icmp eq i64 %niter.nsub.7, 0 br i1 %niter.ncmp.7, label %for.cond.cleanup9.loopexit.unr-lcssa, label %for.body10 } -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tilezero.internal(i16, i16) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) Index: llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll +++ llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll @@ -204,24 +204,24 @@ for.body17: ; preds = %for.body17, %for.body17.lr.ph %indvars.iv = phi i64 [ %1, %for.body17.lr.ph ], [ %indvars.iv.next, %for.body17 ] - %11 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %conv4, i16 %conv5) - %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %conv4, i16 %conv3, ptr %arrayidx, i64 %conv20) + %11 = tail call target("x86.AMX") @llvm.x86.tilezero.internal(i16 %conv4, i16 %conv5) + %12 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %conv4, i16 %conv3, ptr %arrayidx, i64 %conv20) %13 = trunc i64 %indvars.iv to i32 %mul23 = shl nsw i32 %13, 2 %add24 = add nsw i32 %mul23, %mul22 %idxprom25 = sext i32 %add24 to i64 %arrayidx26 = getelementptr inbounds i8, ptr %B_rcr4, i64 %idxprom25 - %14 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %conv13, ptr %arrayidx26, i64 %conv28) - %15 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %conv4, i16 %conv13, i16 %conv3, x86_amx %11, 
x86_amx %12, x86_amx %14) + %14 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %0, i16 %conv13, ptr %arrayidx26, i64 %conv28) + %15 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %conv4, i16 %conv13, i16 %conv3, target("x86.AMX") %11, target("x86.AMX") %12, target("x86.AMX") %14) %16 = add nsw i64 %indvars.iv, %10 %arrayidx33 = getelementptr inbounds i32, ptr %C, i64 %16 - tail call void @llvm.x86.tilestored64.internal(i16 %conv4, i16 %conv5, ptr %arrayidx33, i64 %conv34, x86_amx %15) + tail call void @llvm.x86.tilestored64.internal(i16 %conv4, i16 %conv5, ptr %arrayidx33, i64 %conv34, target("x86.AMX") %15) %indvars.iv.next = add i64 %indvars.iv, %2 %cmp15 = icmp slt i64 %indvars.iv.next, %3 br i1 %cmp15, label %for.body17, label %for.cond.cleanup16 } -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tilezero.internal(i16, i16) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) Index: llvm/test/CodeGen/X86/AMX/amx-greedy-ra.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-greedy-ra.ll +++ llvm/test/CodeGen/X86/AMX/amx-greedy-ra.ll @@ -28,13 +28,13 @@ ; CHECK-NEXT: $ax = COPY [[LEA64_32r]].sub_16bit ; CHECK-NEXT: RET 0, killed $ax entry: - %0 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) - %1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) - %2 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) - %3 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 %col, x86_amx %2, x86_amx %0, x86_amx %1) + %0 = tail call target("x86.AMX") @llvm.x86.tilezero.internal(i16 %row, i16 %col) + %1 = tail call target("x86.AMX") @llvm.x86.tilezero.internal(i16 %row, i16 %col) + %2 = tail call target("x86.AMX") @llvm.x86.tilezero.internal(i16 %row, i16 %col) + %3 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 %col, target("x86.AMX") %2, target("x86.AMX") %0, target("x86.AMX") %1) %4 = add i16 %row, %col ret i16 %4 } -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare target("x86.AMX") @llvm.x86.tilezero.internal(i16, i16) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) Index: llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll +++ llvm/test/CodeGen/X86/AMX/amx-intrinsic-chain.ll @@ -36,23 +36,23 @@ ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: - %a1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %A_mem, i64 64) + %a1 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %A_mem, i64 64) %addr = getelementptr inbounds i8, ptr %A_mem, i64 1024 - %a2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %addr, i64 64) - %c1 = call x86_amx 
@llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %C_mem, i64 64) + %a2 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %addr, i64 64) + %c1 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %C_mem, i64 64) %caddr = getelementptr inbounds i8, ptr %C_mem, i64 1024 - %c2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %caddr, i64 64) + %c2 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %caddr, i64 64) br label %dotpd dotpd: - %b = call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %B_mem, i64 64) - %dp1 = call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, x86_amx %c1, x86_amx %a1, x86_amx %b) - call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr nonnull %C_mem, i64 64, x86_amx %dp1) - %dp2 = call x86_amx @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, x86_amx %c2, x86_amx %a2, x86_amx %b) - call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr nonnull %caddr, i64 64, x86_amx %dp2) + %b = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr nonnull %B_mem, i64 64) + %dp1 = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, target("x86.AMX") %c1, target("x86.AMX") %a1, target("x86.AMX") %b) + call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr nonnull %C_mem, i64 64, target("x86.AMX") %dp1) + %dp2 = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 16, i16 64, i16 64, target("x86.AMX") %c2, target("x86.AMX") %a2, target("x86.AMX") %b) + call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr nonnull %caddr, i64 64, target("x86.AMX") %dp2) ret void } -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) Index: llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll +++ llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll @@ -28,12 +28,12 @@ ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: jmp foo # TAILCALL - %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32) - %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) - %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32) - %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4) - call void @llvm.dbg.value(metadata x86_amx %6, metadata !DILocalVariable(name: "1", scope: !2), metadata !DIExpression()), !dbg !3 - tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %6) + %3 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32) + %4 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr 
getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) + %5 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32) + %6 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, target("x86.AMX") %5, target("x86.AMX") %3, target("x86.AMX") %4) + call void @llvm.dbg.value(metadata target("x86.AMX") %6, metadata !DILocalVariable(name: "1", scope: !2), metadata !DIExpression()), !dbg !3 + tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, target("x86.AMX") %6) tail call void @foo() ret void } @@ -97,19 +97,19 @@ br i1 undef, label %if.true, label %if.false if.true: - %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8) - %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) - %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32) - %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3) - tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %t4) + %t1 = tail call target("x86.AMX") @llvm.x86.tilezero.internal(i16 %0, i16 8) + %t2 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) + %t3 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32) + %t4 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, target("x86.AMX") %t1, target("x86.AMX") %t2, target("x86.AMX") %t3) + tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, target("x86.AMX") %t4) br label %exit if.false: - %t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32) - %t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) - %t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32) - %t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7) - tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %t8) + %t5 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32) + %t6 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) + %t7 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32) + %t8 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, target("x86.AMX") %t5, target("x86.AMX") %t6, target("x86.AMX") %t7) + tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, 
target("x86.AMX") %t8) br label %exit exit: @@ -154,8 +154,8 @@ exit: %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ] - %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1) - tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, ptr @buf, i64 32, x86_amx %6) + %6 = tail call target("x86.AMX") @llvm.x86.tilezero.internal(i16 %5, i16 %1) + tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, ptr @buf, i64 32, target("x86.AMX") %6) ret void } @@ -213,14 +213,14 @@ amx1: %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ] - %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1) - tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, ptr @buf, i64 32, x86_amx %6) + %6 = tail call target("x86.AMX") @llvm.x86.tilezero.internal(i16 %5, i16 %1) + tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, ptr @buf, i64 32, target("x86.AMX") %6) br label %exit amx2: %7 = phi i16 [ %3, %if.true ], [ %4, %if.false ] - %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %7, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) - tail call void @llvm.x86.tilestored64.internal(i16 %7, i16 %1, ptr @buf, i64 32, x86_amx %8) + %8 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %7, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) + tail call void @llvm.x86.tilestored64.internal(i16 %7, i16 %1, ptr @buf, i64 32, target("x86.AMX") %8) br label %exit exit: @@ -271,8 +271,8 @@ br i1 undef, label %if.true, label %if.false if.true: - %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 %2) - tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %2, ptr @buf, i64 32, x86_amx %3) + %3 = tail call target("x86.AMX") @llvm.x86.tilezero.internal(i16 %0, i16 %2) + tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %2, ptr @buf, i64 32, target("x86.AMX") %3) br label %loop.bb2 if.false: @@ -345,8 +345,8 @@ %4 = phi i16 [ %2, %if.true ], [ %3, %if.false ] %5 = icmp eq i16 %4, 7 %6 = add i16 %0, %4 - %7 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 %6) - tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %6, ptr @buf, i64 32, x86_amx %7) + %7 = tail call target("x86.AMX") @llvm.x86.tilezero.internal(i16 %0, i16 %6) + tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %6, ptr @buf, i64 32, target("x86.AMX") %7) br i1 %5, label %loop.bb1, label %exit exit: @@ -356,10 +356,10 @@ declare dso_local void @foo() nounwind declare void @llvm.dbg.value(metadata, metadata, metadata) -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tilezero.internal(i16, i16) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!1} Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll +++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll @@ -35,7 +35,7 @@ 
; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP]], 4 ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_ROWS_COND]], label [[TILELOAD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]] ; CHECK: continue: -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <256 x i32> [[TMP10]] to x86_amx +; CHECK-NEXT: [[TMP11:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[TMP10]]) ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_ROWS_HEADER2:%.*]] ; CHECK: tileload.scalarize.rows.header2: ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_IV5:%.*]] = phi i16 [ 0, [[CONTINUE]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP6:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH4:%.*]] ] @@ -67,7 +67,7 @@ ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_COND7:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP6]], 4 ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_ROWS_COND7]], label [[TILELOAD_SCALARIZE_ROWS_HEADER2]], label [[CONTINUE1:%.*]] ; CHECK: continue1: -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <256 x i32> [[TMP22]] to x86_amx +; CHECK-NEXT: [[TMP23:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[TMP22]]) ; CHECK-NEXT: br label [[TILELOAD_SCALARIZE_ROWS_HEADER17:%.*]] ; CHECK: tileload.scalarize.rows.header17: ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_IV20:%.*]] = phi i16 [ 0, [[CONTINUE1]] ], [ [[TILELOAD_SCALARIZE_ROWS_STEP21:%.*]], [[TILELOAD_SCALARIZE_ROWS_LATCH19:%.*]] ] @@ -99,7 +99,7 @@ ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_COND22:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP21]], 4 ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_ROWS_COND22]], label [[TILELOAD_SCALARIZE_ROWS_HEADER17]], label [[CONTINUE16:%.*]] ; CHECK: continue16: -; CHECK-NEXT: [[TMP35:%.*]] = bitcast <256 x i32> [[TMP34]] to x86_amx +; CHECK-NEXT: [[TMP35:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[TMP34]]) ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER:%.*]] ; CHECK: tiledpbssd.scalarize.rows.header: ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[CONTINUE16]] ], [ [[TILEDPBSSD_SCALARIZE_ROWS_STEP:%.*]], [[TILEDPBSSD_SCALARIZE_ROWS_LATCH:%.*]] ] @@ -153,7 +153,7 @@ ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_ROWS_STEP]], 4 ; CHECK-NEXT: br i1 [[TILEDPBSSD_SCALARIZE_ROWS_COND]], label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE31:%.*]] ; CHECK: continue31: -; CHECK-NEXT: [[TMP55:%.*]] = bitcast <256 x i32> [[TMP54]] to x86_amx +; CHECK-NEXT: [[TMP55:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[TMP54]]) ; CHECK-NEXT: br label [[TILESTORE_SCALARIZE_ROWS_HEADER:%.*]] ; CHECK: tilestore.scalarize.rows.header: ; CHECK-NEXT: [[TILESTORE_SCALARIZE_ROWS_IV:%.*]] = phi i16 [ 0, [[CONTINUE31]] ], [ [[TILESTORE_SCALARIZE_ROWS_STEP:%.*]], [[TILESTORE_SCALARIZE_ROWS_LATCH:%.*]] ] @@ -186,16 +186,16 @@ ; CHECK-NEXT: ret void ; entry: - %0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 4, i16 16, ptr %C_mem, i64 16) - %1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 4, i16 16, ptr %A_mem, i64 16) - %2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 4, i16 16, ptr %B_mem, i64 16) - %3 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 4, i16 16, i16 16, x86_amx %0, x86_amx %1, x86_amx %2) - tail call void @llvm.x86.tilestored64.internal(i16 4, i16 16, ptr %C_mem, i64 16, x86_amx %3) + %0 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 4, i16 16, ptr %C_mem, i64 16) + %1 = tail call target("x86.AMX") 
@llvm.x86.tileloadd64.internal(i16 4, i16 16, ptr %A_mem, i64 16) + %2 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 4, i16 16, ptr %B_mem, i64 16) + %3 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 4, i16 16, i16 16, target("x86.AMX") %0, target("x86.AMX") %1, target("x86.AMX") %2) + tail call void @llvm.x86.tilestored64.internal(i16 4, i16 16, ptr %C_mem, i64 16, target("x86.AMX") %3) ret void } -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) attributes #0 = { noinline nounwind optnone } Index: llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll +++ llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll @@ -37,13 +37,13 @@ ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]] ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_ROWS_COND]], label [[TILELOAD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]] ; CHECK: continue: -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <256 x i32> [[TMP11]] to x86_amx +; CHECK-NEXT: [[TMP12:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[TMP11]]) ; CHECK-NEXT: store <256 x i32> [[TMP11]], ptr [[VPTR:%.*]], align 64 ; CHECK-NEXT: ret void ; entry: - %amx = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr %ptr, i64 %stride) - %vec = bitcast x86_amx %amx to <256 x i32> + %amx = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr %ptr, i64 %stride) + %vec = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %amx) store <256 x i32> %vec, ptr %vptr, align 64 ret void } @@ -84,13 +84,13 @@ ; CHECK-NEXT: [[TILELOAD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILELOAD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]] ; CHECK-NEXT: br i1 [[TILELOAD_SCALARIZE_ROWS_COND]], label [[TILELOAD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]] ; CHECK: continue: -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <256 x i32> [[TMP11]] to x86_amx +; CHECK-NEXT: [[TMP12:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[TMP11]]) ; CHECK-NEXT: store <256 x i32> [[TMP11]], ptr [[VPTR:%.*]], align 64 ; CHECK-NEXT: ret void ; entry: - %amx = call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr %ptr, i64 %stride) - %vec = bitcast x86_amx %amx to <256 x i32> + %amx = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, ptr %ptr, i64 %stride) + %vec = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %amx) store <256 x i32> %vec, ptr %vptr, align 64 ret void } @@ -98,9 +98,9 @@ define dso_local void @test_amx_dpbssd(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, ptr %vptr) #0 { ; CHECK-LABEL: @test_amx_dpbssd( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx -; CHECK-NEXT: [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx -; CHECK-NEXT: [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to 
x86_amx +; CHECK-NEXT: [[A_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[A:%.*]]) +; CHECK-NEXT: [[B_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[B:%.*]]) +; CHECK-NEXT: [[C_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[C:%.*]]) ; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2 ; CHECK-NEXT: br label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER:%.*]] @@ -156,16 +156,16 @@ ; CHECK-NEXT: [[TILEDPBSSD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBSSD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]] ; CHECK-NEXT: br i1 [[TILEDPBSSD_SCALARIZE_ROWS_COND]], label [[TILEDPBSSD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]] ; CHECK: continue: -; CHECK-NEXT: [[TMP21:%.*]] = bitcast <256 x i32> [[TMP20]] to x86_amx +; CHECK-NEXT: [[TMP21:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[TMP20]]) ; CHECK-NEXT: store <256 x i32> [[TMP20]], ptr [[VPTR:%.*]], align 64 ; CHECK-NEXT: ret void ; entry: - %a.amx = bitcast <256 x i32> %a to x86_amx - %b.amx = bitcast <256 x i32> %b to x86_amx - %c.amx = bitcast <256 x i32> %c to x86_amx - %acc = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx) - %vec = bitcast x86_amx %acc to <256 x i32> + %a.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %a) + %b.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %b) + %c.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %c) + %acc = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 %k, target("x86.AMX") %c.amx, target("x86.AMX") %a.amx, target("x86.AMX") %b.amx) + %vec = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %acc) store <256 x i32> %vec, ptr %vptr, align 64 ret void } @@ -173,9 +173,9 @@ define dso_local void @test_amx_dpbsud(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, ptr %vptr) #0 { ; CHECK-LABEL: @test_amx_dpbsud( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx -; CHECK-NEXT: [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx -; CHECK-NEXT: [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx +; CHECK-NEXT: [[A_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[A:%.*]]) +; CHECK-NEXT: [[B_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[B:%.*]]) +; CHECK-NEXT: [[C_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[C:%.*]]) ; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2 ; CHECK-NEXT: br label [[TILEDPBSUD_SCALARIZE_ROWS_HEADER:%.*]] @@ -231,16 +231,16 @@ ; CHECK-NEXT: [[TILEDPBSUD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBSUD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]] ; CHECK-NEXT: br i1 [[TILEDPBSUD_SCALARIZE_ROWS_COND]], label [[TILEDPBSUD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]] ; CHECK: continue: -; CHECK-NEXT: [[TMP21:%.*]] = bitcast <256 x i32> [[TMP20]] to x86_amx +; CHECK-NEXT: [[TMP21:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[TMP20]]) ; CHECK-NEXT: store <256 x i32> [[TMP20]], ptr [[VPTR:%.*]], align 64 ; CHECK-NEXT: ret void ; entry: - %a.amx = bitcast <256 x i32> %a to x86_amx - %b.amx = bitcast <256 x i32> %b to 
x86_amx - %c.amx = bitcast <256 x i32> %c to x86_amx - %acc = call x86_amx @llvm.x86.tdpbsud.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx) - %vec = bitcast x86_amx %acc to <256 x i32> + %a.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %a) + %b.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %b) + %c.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %c) + %acc = call target("x86.AMX") @llvm.x86.tdpbsud.internal(i16 %row, i16 %col, i16 %k, target("x86.AMX") %c.amx, target("x86.AMX") %a.amx, target("x86.AMX") %b.amx) + %vec = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %acc) store <256 x i32> %vec, ptr %vptr, align 64 ret void } @@ -248,9 +248,9 @@ define dso_local void @test_amx_dpbusd(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, ptr %vptr) #0 { ; CHECK-LABEL: @test_amx_dpbusd( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx -; CHECK-NEXT: [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx -; CHECK-NEXT: [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx +; CHECK-NEXT: [[A_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[A:%.*]]) +; CHECK-NEXT: [[B_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[B:%.*]]) +; CHECK-NEXT: [[C_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[C:%.*]]) ; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2 ; CHECK-NEXT: br label [[TILEDPBUSD_SCALARIZE_ROWS_HEADER:%.*]] @@ -306,16 +306,16 @@ ; CHECK-NEXT: [[TILEDPBUSD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBUSD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]] ; CHECK-NEXT: br i1 [[TILEDPBUSD_SCALARIZE_ROWS_COND]], label [[TILEDPBUSD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]] ; CHECK: continue: -; CHECK-NEXT: [[TMP21:%.*]] = bitcast <256 x i32> [[TMP20]] to x86_amx +; CHECK-NEXT: [[TMP21:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[TMP20]]) ; CHECK-NEXT: store <256 x i32> [[TMP20]], ptr [[VPTR:%.*]], align 64 ; CHECK-NEXT: ret void ; entry: - %a.amx = bitcast <256 x i32> %a to x86_amx - %b.amx = bitcast <256 x i32> %b to x86_amx - %c.amx = bitcast <256 x i32> %c to x86_amx - %acc = call x86_amx @llvm.x86.tdpbusd.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx) - %vec = bitcast x86_amx %acc to <256 x i32> + %a.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %a) + %b.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %b) + %c.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %c) + %acc = call target("x86.AMX") @llvm.x86.tdpbusd.internal(i16 %row, i16 %col, i16 %k, target("x86.AMX") %c.amx, target("x86.AMX") %a.amx, target("x86.AMX") %b.amx) + %vec = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %acc) store <256 x i32> %vec, ptr %vptr, align 64 ret void } @@ -323,9 +323,9 @@ define dso_local void @test_amx_dpbuud(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, ptr %vptr) #0 { ; CHECK-LABEL: @test_amx_dpbuud( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx -; CHECK-NEXT: [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx -; 
CHECK-NEXT: [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx +; CHECK-NEXT: [[A_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[A:%.*]]) +; CHECK-NEXT: [[B_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[B:%.*]]) +; CHECK-NEXT: [[C_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[C:%.*]]) ; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2 ; CHECK-NEXT: br label [[TILEDPBUUD_SCALARIZE_ROWS_HEADER:%.*]] @@ -381,16 +381,16 @@ ; CHECK-NEXT: [[TILEDPBUUD_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBUUD_SCALARIZE_ROWS_STEP]], [[ROW:%.*]] ; CHECK-NEXT: br i1 [[TILEDPBUUD_SCALARIZE_ROWS_COND]], label [[TILEDPBUUD_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]] ; CHECK: continue: -; CHECK-NEXT: [[TMP21:%.*]] = bitcast <256 x i32> [[TMP20]] to x86_amx +; CHECK-NEXT: [[TMP21:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[TMP20]]) ; CHECK-NEXT: store <256 x i32> [[TMP20]], ptr [[VPTR:%.*]], align 64 ; CHECK-NEXT: ret void ; entry: - %a.amx = bitcast <256 x i32> %a to x86_amx - %b.amx = bitcast <256 x i32> %b to x86_amx - %c.amx = bitcast <256 x i32> %c to x86_amx - %acc = call x86_amx @llvm.x86.tdpbuud.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx) - %vec = bitcast x86_amx %acc to <256 x i32> + %a.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %a) + %b.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %b) + %c.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %c) + %acc = call target("x86.AMX") @llvm.x86.tdpbuud.internal(i16 %row, i16 %col, i16 %k, target("x86.AMX") %c.amx, target("x86.AMX") %a.amx, target("x86.AMX") %b.amx) + %vec = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %acc) store <256 x i32> %vec, ptr %vptr, align 64 ret void } @@ -398,9 +398,9 @@ define dso_local void @test_amx_dpbf16ps(i16 signext %row, i16 signext %col, i16 signext %k, <256 x i32> %c, <256 x i32> %a, <256 x i32> %b, ptr %vptr) #0 { ; CHECK-LABEL: @test_amx_dpbf16ps( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A_AMX:%.*]] = bitcast <256 x i32> [[A:%.*]] to x86_amx -; CHECK-NEXT: [[B_AMX:%.*]] = bitcast <256 x i32> [[B:%.*]] to x86_amx -; CHECK-NEXT: [[C_AMX:%.*]] = bitcast <256 x i32> [[C:%.*]] to x86_amx +; CHECK-NEXT: [[A_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[A:%.*]]) +; CHECK-NEXT: [[B_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[B:%.*]]) +; CHECK-NEXT: [[C_AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[C:%.*]]) ; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i16 [[K:%.*]], 2 ; CHECK-NEXT: br label [[TILEDPBF16PS_SCALARIZE_ROWS_HEADER:%.*]] @@ -459,16 +459,16 @@ ; CHECK-NEXT: [[TILEDPBF16PS_SCALARIZE_ROWS_COND:%.*]] = icmp ne i16 [[TILEDPBF16PS_SCALARIZE_ROWS_STEP]], [[ROW:%.*]] ; CHECK-NEXT: br i1 [[TILEDPBF16PS_SCALARIZE_ROWS_COND]], label [[TILEDPBF16PS_SCALARIZE_ROWS_HEADER]], label [[CONTINUE:%.*]] ; CHECK: continue: -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <256 x i32> [[TMP23]] to x86_amx +; CHECK-NEXT: [[TMP24:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[TMP23]]) ; CHECK-NEXT: store <256 x i32> [[TMP23]], ptr [[VPTR:%.*]], align 64 ; CHECK-NEXT: ret void ; entry: - %a.amx = 
bitcast <256 x i32> %a to x86_amx - %b.amx = bitcast <256 x i32> %b to x86_amx - %c.amx = bitcast <256 x i32> %c to x86_amx - %acc = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 %row, i16 %col, i16 %k, x86_amx %c.amx, x86_amx %a.amx, x86_amx %b.amx) - %vec = bitcast x86_amx %acc to <256 x i32> + %a.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %a) + %b.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %b) + %c.amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %c) + %acc = call target("x86.AMX") @llvm.x86.tdpbf16ps.internal(i16 %row, i16 %col, i16 %k, target("x86.AMX") %c.amx, target("x86.AMX") %a.amx, target("x86.AMX") %b.amx) + %vec = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %acc) store <256 x i32> %vec, ptr %vptr, align 64 ret void } @@ -476,7 +476,7 @@ define dso_local void @test_amx_store(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr, <256 x i32> %vec) #0 { ; CHECK-LABEL: @test_amx_store( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[AMX:%.*]] = bitcast <256 x i32> [[VEC:%.*]] to x86_amx +; CHECK-NEXT: [[AMX:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[VEC:%.*]]) ; CHECK-NEXT: [[TMP0:%.*]] = lshr i16 [[COL:%.*]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[STRIDE:%.*]], 2 ; CHECK-NEXT: br label [[TILESTORE_SCALARIZE_ROWS_HEADER:%.*]] @@ -511,8 +511,8 @@ ; CHECK-NEXT: ret void ; entry: - %amx = bitcast <256 x i32> %vec to x86_amx - call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %ptr, i64 %stride, x86_amx %amx) + %amx = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %vec) + call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr %ptr, i64 %stride, target("x86.AMX") %amx) ret void } @@ -523,19 +523,21 @@ ; CHECK-NEXT: ret void ; entry: - %amx = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col) - %vec = bitcast x86_amx %amx to <256 x i32> + %amx = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 %row, i16 %col) + %vec = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %amx) store <256 x i32> %vec, ptr %vptr, align 64 ret void } -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tilezero.internal(i16, i16) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbsud.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbusd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbuud.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX")
@llvm.x86.tdpbf16ps.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) +declare <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32>) attributes #0 = { noinline nounwind optnone } Index: llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll +++ llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll @@ -64,7 +64,7 @@ ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: - %t1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 64) + %t1 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 64) br i1 undef, label %loop.header, label %exit loop.header: @@ -73,10 +73,10 @@ br label %loop.body loop.body: - %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 32) - %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 32) - %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3) - tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %buf, i64 32, x86_amx %t4) + %t2 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 32) + %t3 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 32) + %t4 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, target("x86.AMX") %t1, target("x86.AMX") %t2, target("x86.AMX") %t3) + tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %buf, i64 32, target("x86.AMX") %t4) br label %loop.latch loop.latch: @@ -140,7 +140,7 @@ ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: - %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) + %t1 = tail call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 8) br i1 undef, label %loop.header, label %exit loop.header: @@ -149,10 +149,10 @@ br label %loop.body loop.body: - %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 32) - %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 32) - %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3) - tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %buf, i64 32, x86_amx %t4) + %t2 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 32) + %t3 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 32) + %t4 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, target("x86.AMX") %t1, target("x86.AMX") %t2, target("x86.AMX") %t3) + tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %buf, i64 32, target("x86.AMX") %t4) br label %loop.latch loop.latch: @@ -165,7 +165,7 @@ } declare dso_local void @foo() -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tilezero.internal(i16, i16) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare 
target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) Index: llvm/test/CodeGen/X86/AMX/amx-sched.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-sched.ll +++ llvm/test/CodeGen/X86/AMX/amx-sched.ll @@ -5,11 +5,11 @@ ; CHECK-LABEL: test_shape_sched: ; CHECK: ldtilecfg ; CHECK-NOT: movw - %c1 = bitcast <256 x i32> %c to x86_amx - %a1 = bitcast <256 x i32> %a to x86_amx - %b1 = bitcast <256 x i32> %b to x86_amx - %t = call x86_amx @llvm.x86.tdpbssd.internal(i16 %m, i16 %n, i16 %k, x86_amx %c1, x86_amx %a1, x86_amx %b1) - %res = bitcast x86_amx %t to <256 x i32> + %c1 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %c) + %a1 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %a) + %b1 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %b) + %t = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %m, i16 %n, i16 %k, target("x86.AMX") %c1, target("x86.AMX") %a1, target("x86.AMX") %b1) + %res = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %t) ret <256 x i32> %res } @@ -19,13 +19,15 @@ ; CHECK: ldtilecfg ; CHECK-NOT: movw %aa = lshr i16 %k, 2 - %c1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %n, ptr %c, i64 64) - %a1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %k, ptr %a, i64 64) - %b1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %aa, i16 %n, ptr %b, i64 64) - %t = call x86_amx @llvm.x86.tdpbssd.internal(i16 %m, i16 %n, i16 %k, x86_amx %c1, x86_amx %a1, x86_amx %b1) - %res = bitcast x86_amx %t to <256 x i32> + %c1 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %m, i16 %n, ptr %c, i64 64) + %a1 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %m, i16 %k, ptr %a, i64 64) + %b1 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %aa, i16 %n, ptr %b, i64 64) + %t = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %m, i16 %n, i16 %k, target("x86.AMX") %c1, target("x86.AMX") %a1, target("x86.AMX") %b1) + %res = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %t) ret <256 x i32> %res } -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32>) \ No newline at end of file Index: llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll +++ llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll @@ -88,26 +88,26 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq - %c = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32) + %c = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32) br i1 undef, label %if.true, label %if.false if.true: - %a1 = tail call 
x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32) - %b1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) - %d1 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %c, x86_amx %a1, x86_amx %b1) + %a1 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32) + %b1 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) + %d1 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, target("x86.AMX") %c, target("x86.AMX") %a1, target("x86.AMX") %b1) tail call void (...) @foo() br label %exit if.false: - %a2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32) - %b2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) - %d2 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %c, x86_amx %a2, x86_amx %b2) + %a2 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32) + %b2 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 1024), i64 32) + %d2 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, target("x86.AMX") %c, target("x86.AMX") %a2, target("x86.AMX") %b2) tail call void (...) @foo() - tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %d2) + tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, target("x86.AMX") %d2) br label %exit exit: - %d = phi x86_amx [ %d1, %if.true ], [ %d2, %if.false ] - %a = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32) - %res = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %c, x86_amx %d, x86_amx %a) - tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, x86_amx %res) + %d = phi target("x86.AMX") [ %d1, %if.true ], [ %d2, %if.false ] + %a = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %0, i16 8, ptr @buf, i64 32) + %res = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, target("x86.AMX") %c, target("x86.AMX") %d, target("x86.AMX") %a) + tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32, target("x86.AMX") %res) ret void } @@ -166,21 +166,21 @@ ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: - %t5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) + %t5 = tail call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 8) br i1 undef, label %loop.header, label %exit loop.header: %ivphi = phi i16 [0, %entry], [%iv, %loop.latch] - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %buf, i64 32, x86_amx %t5) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %buf, i64 32, target("x86.AMX") %t5) call void (...) 
@foo() br label %loop.body loop.body: - %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) - %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 32) - %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 32) - %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3) - tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %buf, i64 32, x86_amx %t4) + %t1 = tail call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 8) + %t2 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 32) + %t3 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 32) + %t4 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, target("x86.AMX") %t1, target("x86.AMX") %t2, target("x86.AMX") %t3) + tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %buf, i64 32, target("x86.AMX") %t4) br label %loop.latch loop.latch: @@ -194,7 +194,7 @@ declare dso_local void @foo(...) nounwind -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tilezero.internal(i16, i16) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) Index: llvm/test/CodeGen/X86/AMX/amx-spill.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-spill.ll +++ llvm/test/CodeGen/X86/AMX/amx-spill.ll @@ -60,40 +60,40 @@ ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq - %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, ptr @buf, i64 32) - %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, ptr @buf, i64 32) - %6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, ptr @buf, i64 32) - %7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, ptr @buf, i64 32) - %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, ptr @buf, i64 32) - %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, ptr @buf, i64 32) - %10 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, ptr @buf, i64 32) + %4 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, ptr @buf, i64 32) + %5 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, ptr @buf, i64 32) + %6 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, ptr @buf, i64 32) + %7 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, ptr @buf, i64 32) + %8 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, ptr @buf, i64 32) + %9 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, ptr @buf, i64 32) + %10 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %2, i16 %2, ptr @buf, i64 32) %11 = icmp eq i32 %0, 0 br i1 %11, label %16, label %12 12: ; preds = %3 - %13 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %1, i16 %1, ptr @buf, i64 32) - %14 = 
tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, ptr @buf, i64 32) - %15 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, ptr @buf, i64 32) + %13 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %1, i16 %1, ptr @buf, i64 32) + %14 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, ptr @buf, i64 32) + %15 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, ptr @buf, i64 32) br label %20 16: ; preds = %3 - %17 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %1, i16 %1, ptr @buf2, i64 32) - %18 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, ptr @buf2, i64 32) - %19 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, ptr @buf2, i64 32) + %17 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %1, i16 %1, ptr @buf2, i64 32) + %18 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, ptr @buf2, i64 32) + %19 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %1, i16 %2, ptr @buf2, i64 32) br label %20 20: ; preds = %16, %12 - %21 = phi x86_amx [ %17, %16 ], [ %13, %12 ] - %22 = phi x86_amx [ %18, %16 ], [ %14, %12 ] - %23 = phi x86_amx [ %19, %16 ], [ %15, %12 ] - %24 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %1, x86_amx %23, x86_amx %21, x86_amx %22) - %25 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %2, x86_amx %6, x86_amx %24, x86_amx %5) - %26 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %2, x86_amx %8, x86_amx %25, x86_amx %7) - %27 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %2, i16 %2, i16 %2, x86_amx %10, x86_amx %26, x86_amx %9) - tail call void @llvm.x86.tilestored64.internal(i16 %2, i16 %2, ptr @buf, i64 32, x86_amx %27) + %21 = phi target("x86.AMX") [ %17, %16 ], [ %13, %12 ] + %22 = phi target("x86.AMX") [ %18, %16 ], [ %14, %12 ] + %23 = phi target("x86.AMX") [ %19, %16 ], [ %15, %12 ] + %24 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %1, target("x86.AMX") %23, target("x86.AMX") %21, target("x86.AMX") %22) + %25 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %2, target("x86.AMX") %6, target("x86.AMX") %24, target("x86.AMX") %5) + %26 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %1, i16 %2, i16 %2, target("x86.AMX") %8, target("x86.AMX") %25, target("x86.AMX") %7) + %27 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %2, i16 %2, i16 %2, target("x86.AMX") %10, target("x86.AMX") %26, target("x86.AMX") %9) + tail call void @llvm.x86.tilestored64.internal(i16 %2, i16 %2, ptr @buf, i64 32, target("x86.AMX") %27) ret void } -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) Index: llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll +++ llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll @@ -28,26 +28,26 @@ ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq - 
%c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) - %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride) - %b = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride) - %d0 = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) - %d1 = call x86_amx @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, x86_amx %d0, x86_amx %a, x86_amx %b) - %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b) - %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b) - %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b) - %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, ptr %base, i64 %stride) - call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %pointer, i64 %stride, x86_amx %d4) + %c = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 8) + %a = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride) + %b = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %base, i64 %stride) + %d0 = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, target("x86.AMX") %c, target("x86.AMX") %a, target("x86.AMX") %b) + %d1 = call target("x86.AMX") @llvm.x86.tdpbsud.internal(i16 8, i16 8, i16 8, target("x86.AMX") %d0, target("x86.AMX") %a, target("x86.AMX") %b) + %d2 = call target("x86.AMX") @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, target("x86.AMX") %d1, target("x86.AMX") %a, target("x86.AMX") %b) + %d3 = call target("x86.AMX") @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, target("x86.AMX") %d2, target("x86.AMX") %a, target("x86.AMX") %b) + %d4 = call target("x86.AMX") @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, target("x86.AMX") %d3, target("x86.AMX") %a, target("x86.AMX") %b) + %e = call target("x86.AMX") @llvm.x86.tileloaddt164.internal(i16 8, i16 8, ptr %base, i64 %stride) + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, ptr %pointer, i64 %stride, target("x86.AMX") %d4) ret void } -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tilezero.internal(i16, i16) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tileloaddt164.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbsud.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbusd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbuud.internal(i16, i16, i16, 
target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbf16ps.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) Index: llvm/test/CodeGen/X86/AMX/amx-type.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-type.ll +++ llvm/test/CodeGen/X86/AMX/amx-type.ll @@ -6,27 +6,27 @@ @buf = dso_local global [1024 x i8] zeroinitializer, align 64 @buf2 = dso_local global [1024 x i8] zeroinitializer, align 64 -; test bitcast x86_amx to <256 x i32> +; test bitcast target("x86.AMX") to <256 x i32> define dso_local void @test_user_empty(i16 %m, i16 %n, ptr%buf, i64 %s) { ; CHECK-LABEL: @test_user_empty( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]]) +; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]]) ; CHECK-NEXT: ret void ; entry: - %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %n, ptr %buf, i64 %s) - %t2 = bitcast x86_amx %t1 to <256 x i32> + %t1 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %m, i16 %n, ptr %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %t1) ret void } -; test bitcast <256 x i32> to x86_amx +; test bitcast <256 x i32> to target("x86.AMX") define dso_local void @test_user_empty2(<256 x i32> %in) { ; CHECK-LABEL: @test_user_empty2( ; CHECK-NEXT: entry: ; CHECK-NEXT: ret void ; entry: - %t = bitcast <256 x i32> %in to x86_amx + %t = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %in) ret void } @@ -34,28 +34,28 @@ ; CHECK-LABEL: @test_amx_load_bitcast( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[T1:%.*]] = load <256 x i32>, ptr [[IN:%.*]], align 64 -; CHECK-NEXT: [[TMP0:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N:%.*]], ptr [[IN]], i64 64) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP0]]) +; CHECK-NEXT: [[TMP0:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N:%.*]], ptr [[IN]], i64 64) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[BUF:%.*]], i64 [[S:%.*]], target("x86.AMX") [[TMP0]]) ; CHECK-NEXT: ret <256 x i32> [[T1]] ; entry: %t1 = load <256 x i32>, ptr %in, align 64 - %t2 = bitcast <256 x i32> %t1 to x86_amx - call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, ptr %buf, i64 %s, x86_amx %t2) + %t2 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t1) + call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, ptr %buf, i64 %s, target("x86.AMX") %t2) ret <256 x i32> %t1 } define dso_local <256 x i32> @test_amx_bitcast_store(ptr %out, i16 %m, i16 %n, ptr%buf, i64 %s) { ; CHECK-LABEL: @test_amx_bitcast_store( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[M]], ptr [[BUF:%.*]], i64 [[S:%.*]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[M]], ptr [[OUT:%.*]], i64 64, x86_amx [[T1]]) +; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[M]], ptr [[BUF:%.*]], i64 [[S:%.*]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 
[[M]], ptr [[OUT:%.*]], i64 64, target("x86.AMX") [[T1]]) ; CHECK-NEXT: [[TMP0:%.*]] = load <256 x i32>, ptr [[OUT]], align 1024 ; CHECK-NEXT: ret <256 x i32> [[TMP0]] ; entry: - %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %m, ptr %buf, i64 %s) - %t2 = bitcast x86_amx %t1 to <256 x i32> + %t1 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %m, i16 %m, ptr %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %t1) store <256 x i32> %t2, ptr %out ret <256 x i32> %t2 } @@ -66,14 +66,14 @@ ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 ; CHECK-NEXT: [[ADD:%.*]] = add <256 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: store <256 x i32> [[ADD]], ptr [[TMP0]], align 1024 -; CHECK-NEXT: [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C:%.*]], ptr [[TMP0]], i64 64) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP1]]) +; CHECK-NEXT: [[TMP1:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C:%.*]], ptr [[TMP0]], i64 64) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[BUF:%.*]], i64 [[S:%.*]], target("x86.AMX") [[TMP1]]) ; CHECK-NEXT: ret void ; entry: %add = add <256 x i32> %y, %x - %t = bitcast <256 x i32> %add to x86_amx - call void @llvm.x86.tilestored64.internal(i16 %r, i16 %c, ptr %buf, i64 %s, x86_amx %t) + %t = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %add) + call void @llvm.x86.tilestored64.internal(i16 %r, i16 %c, ptr %buf, i64 %s, target("x86.AMX") %t) ret void } @@ -81,15 +81,15 @@ ; CHECK-LABEL: @test_src_add2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 -; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[TMP0]], i64 64, x86_amx [[T1]]) +; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[TMP0]], i64 64, target("x86.AMX") [[T1]]) ; CHECK-NEXT: [[TMP1:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024 ; CHECK-NEXT: [[ADD:%.*]] = add <256 x i32> [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret void ; entry: - %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %r, i16 %c, ptr %buf, i64 %s) - %t2 = bitcast x86_amx %t1 to <256 x i32> + %t1 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %r, i16 %c, ptr %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %t1) %add = add <256 x i32> %t2, %x ret void } @@ -127,9 +127,9 @@ ; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP2:%.*]], 32 ; CHECK-NEXT: [[TMP8:%.*]] = ashr exact i64 [[TMP7]], 32 -; CHECK-NEXT: [[TMP9:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP1:%.*]], i64 [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP1:%.*]], i64 [[TMP8]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP0]], i64 0, i32 2 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64, x86_amx [[TMP9]]) +; 
CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64, target("x86.AMX") [[TMP9]]) ; CHECK-NEXT: ret void ; %4 = load i16, ptr %0, align 64 @@ -137,8 +137,8 @@ %6 = load i16, ptr %5, align 2 %7 = shl i64 %2, 32 %8 = ashr exact i64 %7, 32 - %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %4, i16 %6, ptr %1, i64 %8) - %10 = bitcast x86_amx %9 to <256 x i32> + %9 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %4, i16 %6, ptr %1, i64 %8) + %10 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %9) %11 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 2 store <256 x i32> %10, ptr %11, align 64 ret void @@ -153,13 +153,13 @@ ; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[TMP7]], align 2 ; CHECK-NEXT: [[TMP9:%.*]] = udiv i16 [[TMP8]], 4 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP0:%.*]], i64 0, i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64) +; CHECK-NEXT: [[TMP11:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP1]], i64 0, i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP8]], ptr [[TMP12]], i64 64) +; CHECK-NEXT: [[TMP13:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP8]], ptr [[TMP12]], i64 64) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP2]], i64 0, i32 2 -; CHECK-NEXT: [[TMP15:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP6]], ptr [[TMP14]], i64 64) -; CHECK-NEXT: [[TMP16:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[TMP4]], i16 [[TMP6]], i16 [[TMP8]], x86_amx [[TMP11]], x86_amx [[TMP13]], x86_amx [[TMP15]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64, x86_amx [[TMP16]]) +; CHECK-NEXT: [[TMP15:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP6]], ptr [[TMP14]], i64 64) +; CHECK-NEXT: [[TMP16:%.*]] = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 [[TMP4]], i16 [[TMP6]], i16 [[TMP8]], target("x86.AMX") [[TMP11]], target("x86.AMX") [[TMP13]], target("x86.AMX") [[TMP15]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64, target("x86.AMX") [[TMP16]]) ; CHECK-NEXT: ret void ; %4 = load i16, ptr %1, align 64 @@ -169,15 +169,15 @@ %8 = load i16, ptr %7, align 2 %9 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 2 %10 = load <256 x i32>, ptr %9, align 64 - %11 = bitcast <256 x i32> %10 to x86_amx + %11 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %10) %12 = getelementptr inbounds %struct.__tile_str, ptr %1, i64 0, i32 2 %13 = load <256 x i32>, ptr %12, align 64 - %14 = bitcast <256 x i32> %13 to x86_amx + %14 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %13) %15 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 2 %16 = load <256 x i32>, ptr %15, align 64 - %17 = bitcast <256 x i32> %16 to x86_amx - %18 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %4, i16 %6, i16 %8, x86_amx %11, x86_amx %14, x86_amx %17) - %19 = bitcast x86_amx %18 to <256 x i32> + %17 = call target("x86.AMX") 
@llvm.x86.bitconvert.vector.to.tile(<256 x i32> %16) + %18 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %4, i16 %6, i16 %8, target("x86.AMX") %11, target("x86.AMX") %14, target("x86.AMX") %17) + %19 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %18) store <256 x i32> %19, ptr %9, align 64 ret void } @@ -185,21 +185,21 @@ define dso_local void @__tile_dpbsud(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) { ; CHECK-LABEL: @__tile_dpbsud( ; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) -; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) -; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) -; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]]) +; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) +; CHECK-NEXT: [[TMP3:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) +; CHECK-NEXT: [[TMP4:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) +; CHECK-NEXT: [[T6:%.*]] = tail call target("x86.AMX") @llvm.x86.tdpbsud.internal(i16 [[M]], i16 [[N]], i16 [[K]], target("x86.AMX") [[TMP4]], target("x86.AMX") [[TMP2]], target("x86.AMX") [[TMP3]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, target("x86.AMX") [[T6]]) ; CHECK-NEXT: ret void ; %t0 = load <256 x i32>, ptr %pa, align 64 - %t1 = bitcast <256 x i32> %t0 to x86_amx + %t1 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t0) %t2 = load <256 x i32>, ptr %pb, align 64 - %t3 = bitcast <256 x i32> %t2 to x86_amx + %t3 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t2) %t4 = load <256 x i32>, ptr %pc, align 64 - %t5 = bitcast <256 x i32> %t4 to x86_amx - %t6 = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3) - %t7 = bitcast x86_amx %t6 to <256 x i32> + %t5 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t4) + %t6 = tail call target("x86.AMX") @llvm.x86.tdpbsud.internal(i16 %m, i16 %n, i16 %k, target("x86.AMX") %t5, target("x86.AMX") %t1, target("x86.AMX") %t3) + %t7 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %t6) store <256 x i32> %t7, ptr %pc, align 64 ret void } @@ -207,21 +207,21 @@ define dso_local void @__tile_dpbusd(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) { ; CHECK-LABEL: @__tile_dpbusd( ; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) -; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) -; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) -; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 [[M]], i16 [[N]], 
i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]]) +; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) +; CHECK-NEXT: [[TMP3:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) +; CHECK-NEXT: [[TMP4:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) +; CHECK-NEXT: [[T6:%.*]] = tail call target("x86.AMX") @llvm.x86.tdpbusd.internal(i16 [[M]], i16 [[N]], i16 [[K]], target("x86.AMX") [[TMP4]], target("x86.AMX") [[TMP2]], target("x86.AMX") [[TMP3]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, target("x86.AMX") [[T6]]) ; CHECK-NEXT: ret void ; %t0 = load <256 x i32>, ptr %pa, align 64 - %t1 = bitcast <256 x i32> %t0 to x86_amx + %t1 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t0) %t2 = load <256 x i32>, ptr %pb, align 64 - %t3 = bitcast <256 x i32> %t2 to x86_amx + %t3 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t2) %t4 = load <256 x i32>, ptr %pc, align 64 - %t5 = bitcast <256 x i32> %t4 to x86_amx - %t6 = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3) - %t7 = bitcast x86_amx %t6 to <256 x i32> + %t5 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t4) + %t6 = tail call target("x86.AMX") @llvm.x86.tdpbusd.internal(i16 %m, i16 %n, i16 %k, target("x86.AMX") %t5, target("x86.AMX") %t1, target("x86.AMX") %t3) + %t7 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %t6) store <256 x i32> %t7, ptr %pc, align 64 ret void } @@ -229,21 +229,21 @@ define dso_local void @__tile_dpbuud(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) { ; CHECK-LABEL: @__tile_dpbuud( ; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) -; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) -; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) -; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]]) +; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) +; CHECK-NEXT: [[TMP3:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) +; CHECK-NEXT: [[TMP4:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) +; CHECK-NEXT: [[T6:%.*]] = tail call target("x86.AMX") @llvm.x86.tdpbuud.internal(i16 [[M]], i16 [[N]], i16 [[K]], target("x86.AMX") [[TMP4]], target("x86.AMX") [[TMP2]], target("x86.AMX") [[TMP3]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, target("x86.AMX") [[T6]]) ; CHECK-NEXT: ret void ; %t0 = load <256 x i32>, ptr %pa, align 64 - %t1 = bitcast <256 x i32> %t0 
to x86_amx + %t1 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t0) %t2 = load <256 x i32>, ptr %pb, align 64 - %t3 = bitcast <256 x i32> %t2 to x86_amx + %t3 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t2) %t4 = load <256 x i32>, ptr %pc, align 64 - %t5 = bitcast <256 x i32> %t4 to x86_amx - %t6 = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3) - %t7 = bitcast x86_amx %t6 to <256 x i32> + %t5 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t4) + %t6 = tail call target("x86.AMX") @llvm.x86.tdpbuud.internal(i16 %m, i16 %n, i16 %k, target("x86.AMX") %t5, target("x86.AMX") %t1, target("x86.AMX") %t3) + %t7 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %t6) store <256 x i32> %t7, ptr %pc, align 64 ret void } @@ -251,21 +251,21 @@ define dso_local void @__tile_dpbf16ps(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) { ; CHECK-LABEL: @__tile_dpbf16ps( ; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) -; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) -; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) -; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]]) +; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) +; CHECK-NEXT: [[TMP3:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) +; CHECK-NEXT: [[TMP4:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) +; CHECK-NEXT: [[T6:%.*]] = tail call target("x86.AMX") @llvm.x86.tdpbf16ps.internal(i16 [[M]], i16 [[N]], i16 [[K]], target("x86.AMX") [[TMP4]], target("x86.AMX") [[TMP2]], target("x86.AMX") [[TMP3]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, target("x86.AMX") [[T6]]) ; CHECK-NEXT: ret void ; %t0 = load <256 x i32>, ptr %pa, align 64 - %t1 = bitcast <256 x i32> %t0 to x86_amx + %t1 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t0) %t2 = load <256 x i32>, ptr %pb, align 64 - %t3 = bitcast <256 x i32> %t2 to x86_amx + %t3 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t2) %t4 = load <256 x i32>, ptr %pc, align 64 - %t5 = bitcast <256 x i32> %t4 to x86_amx - %t6 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3) - %t7 = bitcast x86_amx %t6 to <256 x i32> + %t5 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t4) + %t6 = tail call target("x86.AMX") @llvm.x86.tdpbf16ps.internal(i16 %m, i16 %n, i16 %k, target("x86.AMX") %t5, target("x86.AMX") %t1, target("x86.AMX") %t3) + %t7 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %t6) store <256 x i32> %t7, ptr %pc, align 64 ret void } @@ -276,10 +276,10 @@ ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr 
[[TMP2]], i64 0, i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP2]], i64 0, i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP7]], i64 64) +; CHECK-NEXT: [[TMP8:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP7]], i64 64) ; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[TMP1:%.*]], 32 ; CHECK-NEXT: [[TMP10:%.*]] = ashr exact i64 [[TMP9]], 32 -; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP0:%.*]], i64 [[TMP10]], x86_amx [[TMP8]]) +; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP0:%.*]], i64 [[TMP10]], target("x86.AMX") [[TMP8]]) ; CHECK-NEXT: ret void ; %4 = load i16, ptr %2, align 64 @@ -287,17 +287,19 @@ %6 = load i16, ptr %5, align 2 %7 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 2 %8 = load <256 x i32>, ptr %7, align 64 - %9 = bitcast <256 x i32> %8 to x86_amx + %9 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %8) %10 = shl i64 %1, 32 %11 = ashr exact i64 %10, 32 - tail call void @llvm.x86.tilestored64.internal(i16 %4, i16 %6, ptr %0, i64 %11, x86_amx %9) + tail call void @llvm.x86.tilestored64.internal(i16 %4, i16 %6, ptr %0, i64 %11, target("x86.AMX") %9) ret void } -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbsud.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbusd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbuud.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbf16ps.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) +declare <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32>) \ No newline at end of file Index: llvm/test/CodeGen/X86/AMX/amx-zero-config.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/amx-zero-config.ll +++ llvm/test/CodeGen/X86/AMX/amx-zero-config.ll @@ -184,10 +184,10 @@ ; SSE2-O0-NEXT: tilerelease ; SSE2-O0-NEXT: retq entry: - %t = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) - call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %buf, i64 1024, x86_amx %t) + %t = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) + call void 
@llvm.x86.tilestored64.internal(i16 8, i16 32, ptr %buf, i64 1024, target("x86.AMX") %t) ret void } -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tilezero.internal(i16, i16) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) Index: llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll +++ llvm/test/CodeGen/X86/AMX/lat-combine-amx-bitcast.ll @@ -4,15 +4,15 @@ define void @combine_amx_cast_inside_bb() { ; CHECK-LABEL: @combine_amx_cast_inside_bb( ; CHECK-NEXT: wrapper_entry: -; CHECK-NEXT: [[TMP0:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP0]]) +; CHECK-NEXT: [[TMP0:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") [[TMP0]]) ; CHECK-NEXT: ret void ; wrapper_entry: - %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) - %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) - %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %tmp) - call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %1) + %0 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) + %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX") %0) + %1 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %tmp) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") %1) ret void } @@ -24,39 +24,39 @@ ; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = alloca <616 x i8>, align 64 ; CHECK-NEXT: [[TMP2:%.*]] = alloca <110 x i32>, align 64 -; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) +; CHECK-NEXT: [[TMP3:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) ; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP2]], align 512 -; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40) +; CHECK-NEXT: [[TMP5:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40) ; CHECK-NEXT: store <616 x i8> undef, ptr [[TMP1]], align 1024 -; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP1]], i64 56) +; CHECK-NEXT: [[TMP7:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP1]], i64 56) ; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP0]], align 1024 -; CHECK-NEXT: [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40) -; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP5]], x86_amx [[TMP7]], x86_amx [[TMP9]]) +; CHECK-NEXT: [[TMP9:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 14, i16 
40, ptr [[TMP0]], i64 40) +; CHECK-NEXT: [[TMP10:%.*]] = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, target("x86.AMX") [[TMP5]], target("x86.AMX") [[TMP7]], target("x86.AMX") [[TMP9]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_I_I]] ; CHECK: for.cond.cleanup.i.i: -; CHECK-NEXT: [[TMP11:%.*]] = phi x86_amx [ [[TMP3]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ] -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP11]]) +; CHECK-NEXT: [[TMP11:%.*]] = phi target("x86.AMX") [ [[TMP3]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ] +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") [[TMP11]]) ; CHECK-NEXT: ret void ; wrapper_entry: - %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) - %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) + %0 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) + %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX") %0) br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry - %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) - %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) - %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) - %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) - %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) + %1 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) + %2 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) + %3 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) + %4 = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, target("x86.AMX") %1, target("x86.AMX") %2, target("x86.AMX") %3) + %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX") %4) br label %for.cond.cleanup.i.i for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ] - %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) - call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6) + %6 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") %6) ret void } @@ -74,20 +74,20 @@ ; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP4]], align 512 -; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP4]], i64 40) +; CHECK-NEXT: [[TMP7:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP4]], i64 40) ; CHECK-NEXT: store <616 x i8> undef, ptr [[TMP3]], align 1024 -; CHECK-NEXT: [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP3]], i64 56) +; CHECK-NEXT: [[TMP9:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP3]], i64 56) ; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP2]], align 1024 -; CHECK-NEXT: 
[[TMP11:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP2]], i64 40) -; CHECK-NEXT: [[TMP12:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP7]], x86_amx [[TMP9]], x86_amx [[TMP11]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP1]], i64 40, x86_amx [[TMP12]]) +; CHECK-NEXT: [[TMP11:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP2]], i64 40) +; CHECK-NEXT: [[TMP12:%.*]] = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, target("x86.AMX") [[TMP7]], target("x86.AMX") [[TMP9]], target("x86.AMX") [[TMP11]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP1]], i64 40, target("x86.AMX") [[TMP12]]) ; CHECK-NEXT: [[TMP14:%.*]] = load <110 x i32>, ptr [[TMP1]], align 512 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_I_I]] ; CHECK: for.cond.cleanup.i.i: ; CHECK-NEXT: [[EVILPHI:%.*]] = phi <110 x i32> [ [[TMP5]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP14]], [[FOR_BODY_I_LR_PH_I]] ] ; CHECK-NEXT: store <110 x i32> [[EVILPHI]], ptr [[TMP0]], align 512 -; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP16]]) +; CHECK-NEXT: [[TMP16:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") [[TMP16]]) ; CHECK-NEXT: ret void ; wrapper_entry: @@ -95,17 +95,17 @@ br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry - %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) - %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) - %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) - %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) - %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) + %1 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) + %2 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) + %3 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) + %4 = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, target("x86.AMX") %1, target("x86.AMX") %2, target("x86.AMX") %3) + %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX") %4) br label %for.cond.cleanup.i.i for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry %evilphi = phi <110 x i32> [ %0, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ] - %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi) - call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6) + %6 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") %6) ret void } @@ -120,26 +120,26 @@ ; CHECK-NEXT: [[TMP3:%.*]] = alloca <616 x i8>, align 64 ; CHECK-NEXT: [[TMP4:%.*]] = alloca <110 x i32>, align 64 ; CHECK-NEXT: [[TMP5:%.*]] = alloca <110 x i32>, align 64 -; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) -; 
CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP5]], i64 40, x86_amx [[TMP6]]) +; CHECK-NEXT: [[TMP6:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP5]], i64 40, target("x86.AMX") [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = load <110 x i32>, ptr [[TMP5]], align 512 ; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP4]], align 512 -; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP4]], i64 40) +; CHECK-NEXT: [[TMP10:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP4]], i64 40) ; CHECK-NEXT: store <616 x i8> undef, ptr [[TMP3]], align 1024 -; CHECK-NEXT: [[TMP12:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP3]], i64 56) +; CHECK-NEXT: [[TMP12:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP3]], i64 56) ; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP2]], align 1024 -; CHECK-NEXT: [[TMP14:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP2]], i64 40) -; CHECK-NEXT: [[TMP15:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP10]], x86_amx [[TMP12]], x86_amx [[TMP14]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP1]], i64 40, x86_amx [[TMP15]]) +; CHECK-NEXT: [[TMP14:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP2]], i64 40) +; CHECK-NEXT: [[TMP15:%.*]] = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, target("x86.AMX") [[TMP10]], target("x86.AMX") [[TMP12]], target("x86.AMX") [[TMP14]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP1]], i64 40, target("x86.AMX") [[TMP15]]) ; CHECK-NEXT: [[TMP17:%.*]] = load <110 x i32>, ptr [[TMP1]], align 512 ; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]] ; CHECK: for.cond.cleanup.i.i: ; CHECK-NEXT: [[GOODPHI:%.*]] = phi <110 x i32> [ [[TMP8]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP17]], [[FOR_BODY_I_LR_PH_I]] ] ; CHECK-NEXT: store <110 x i32> [[GOODPHI]], ptr [[TMP0]], align 512 -; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP19]]) +; CHECK-NEXT: [[TMP19:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") [[TMP19]]) ; CHECK-NEXT: br i1 undef, label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]] ; CHECK: exit: ; CHECK-NEXT: [[EVILPHI2:%.*]] = phi <110 x i32> [ [[GOODPHI]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP17]], [[FOR_BODY_I_LR_PH_I]] ] @@ -147,22 +147,22 @@ ; CHECK-NEXT: ret void ; wrapper_entry: - %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) - %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) + %0 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) + %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX") %0) br i1 undef, label %for.cond.cleanup.i.i, label 
%for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry - %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) - %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) - %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) - %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) - %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) + %1 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) + %2 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) + %3 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) + %4 = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, target("x86.AMX") %1, target("x86.AMX") %2, target("x86.AMX") %3) + %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX") %4) br i1 undef, label %for.cond.cleanup.i.i, label %exit for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ] - %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) - call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6) + %6 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") %6) br i1 undef, label %exit, label %for.body.i.lr.ph.i exit: %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, %for.body.i.lr.ph.i ] @@ -176,37 +176,37 @@ ; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = alloca <616 x i8>, align 64 ; CHECK-NEXT: [[TMP2:%.*]] = alloca <110 x i32>, align 64 -; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 11, i16 40) +; CHECK-NEXT: [[TMP3:%.*]] = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 11, i16 40) ; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP2]], align 512 -; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40) +; CHECK-NEXT: [[TMP5:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40) ; CHECK-NEXT: store <616 x i8> undef, ptr [[TMP1]], align 1024 -; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP1]], i64 56) +; CHECK-NEXT: [[TMP7:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP1]], i64 56) ; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP0]], align 1024 -; CHECK-NEXT: [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40) -; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP5]], x86_amx [[TMP7]], x86_amx [[TMP9]]) +; CHECK-NEXT: [[TMP9:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40) +; CHECK-NEXT: [[TMP10:%.*]] = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, target("x86.AMX") [[TMP5]], target("x86.AMX") [[TMP7]], target("x86.AMX") [[TMP9]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_I_I]] ; CHECK: for.cond.cleanup.i.i: -; CHECK-NEXT: [[TMP11:%.*]] = phi x86_amx [ [[TMP3]], 
[[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ] -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP11]]) +; CHECK-NEXT: [[TMP11:%.*]] = phi target("x86.AMX") [ [[TMP3]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ] +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") [[TMP11]]) ; CHECK-NEXT: ret void ; wrapper_entry: br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry - %0 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) - %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) - %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) - %3 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %0, x86_amx %1, x86_amx %2) - %4 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %3) + %0 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) + %1 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) + %2 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) + %3 = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, target("x86.AMX") %0, target("x86.AMX") %1, target("x86.AMX") %2) + %4 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX") %3) br label %for.cond.cleanup.i.i for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry %evilphi = phi <110 x i32> [ undef, %wrapper_entry ], [ %4, %for.body.i.lr.ph.i ] - %5 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi) - call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %5) + %5 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") %5) ret void } @@ -219,48 +219,48 @@ ; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 ; CHECK-NEXT: [[TMP1:%.*]] = alloca <616 x i8>, align 64 ; CHECK-NEXT: [[TMP2:%.*]] = alloca <110 x i32>, align 64 -; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) +; CHECK-NEXT: [[TMP3:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) ; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; CHECK: for.body.i.lr.ph.i: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP2]], align 512 -; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40) +; CHECK-NEXT: [[TMP5:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP2]], i64 40) ; CHECK-NEXT: store <616 x i8> undef, ptr [[TMP1]], align 1024 -; CHECK-NEXT: [[TMP7:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP1]], i64 56) +; CHECK-NEXT: [[TMP7:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP1]], i64 56) ; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP0]], align 1024 -; CHECK-NEXT: [[TMP9:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40) -; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP5]], x86_amx [[TMP7]], x86_amx [[TMP9]]) +; CHECK-NEXT: [[TMP9:%.*]] = call 
target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40) +; CHECK-NEXT: [[TMP10:%.*]] = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, target("x86.AMX") [[TMP5]], target("x86.AMX") [[TMP7]], target("x86.AMX") [[TMP9]]) ; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I]], label [[EXIT:%.*]] ; CHECK: for.cond.cleanup.i.i: -; CHECK-NEXT: [[TMP11:%.*]] = phi x86_amx [ [[TMP3]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ] -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP11]]) +; CHECK-NEXT: [[TMP11:%.*]] = phi target("x86.AMX") [ [[TMP3]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ] +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") [[TMP11]]) ; CHECK-NEXT: br i1 undef, label [[EXIT]], label [[FOR_BODY_I_LR_PH_I]] ; CHECK: exit: -; CHECK-NEXT: [[TMP12:%.*]] = phi x86_amx [ [[TMP11]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ] -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP12]]) +; CHECK-NEXT: [[TMP12:%.*]] = phi target("x86.AMX") [ [[TMP11]], [[FOR_COND_CLEANUP_I_I]] ], [ [[TMP10]], [[FOR_BODY_I_LR_PH_I]] ] +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") [[TMP12]]) ; CHECK-NEXT: ret void ; wrapper_entry: - %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) - %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) + %0 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) + %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX") %0) br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry - %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) - %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) - %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) - %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) - %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) + %1 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) + %2 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) + %3 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) + %4 = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, target("x86.AMX") %1, target("x86.AMX") %2, target("x86.AMX") %3) + %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX") %4) br i1 undef, label %for.cond.cleanup.i.i, label %exit for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %5, %for.body.i.lr.ph.i ] - %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) - call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6) + %6 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") %6) br i1 undef, label %exit, label %for.body.i.lr.ph.i exit: %evilphi2 = phi <110 x i32> [ %goodphi, %for.cond.cleanup.i.i ], [ %5, 
%for.body.i.lr.ph.i ] - %7 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) - call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %7) + %7 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") %7) ret void } @@ -272,59 +272,59 @@ ; CHECK-NEXT: [[TMP1:%.*]] = alloca <560 x i8>, align 64 ; CHECK-NEXT: [[TMP2:%.*]] = alloca <616 x i8>, align 64 ; CHECK-NEXT: [[TMP3:%.*]] = alloca <110 x i32>, align 64 -; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) +; CHECK-NEXT: [[TMP4:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: store <110 x i32> undef, ptr [[TMP3]], align 512 -; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP3]], i64 40) +; CHECK-NEXT: [[TMP6:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr [[TMP3]], i64 40) ; CHECK-NEXT: store <616 x i8> undef, ptr [[TMP2]], align 1024 -; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP2]], i64 56) +; CHECK-NEXT: [[TMP8:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr [[TMP2]], i64 56) ; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP1]], align 1024 -; CHECK-NEXT: [[TMP10:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP1]], i64 40) -; CHECK-NEXT: [[TMP11:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP6]], x86_amx [[TMP8]], x86_amx [[TMP10]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40, x86_amx [[TMP11]]) +; CHECK-NEXT: [[TMP10:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP1]], i64 40) +; CHECK-NEXT: [[TMP11:%.*]] = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, target("x86.AMX") [[TMP6]], target("x86.AMX") [[TMP8]], target("x86.AMX") [[TMP10]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr [[TMP0]], i64 40, target("x86.AMX") [[TMP11]]) ; CHECK-NEXT: [[TMP13:%.*]] = load <110 x i32>, ptr [[TMP0]], align 512 ; CHECK-NEXT: br i1 undef, label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP14:%.*]] = phi x86_amx [ [[TMP15:%.*]], [[BB3]] ], [ [[TMP11]], [[BB1]] ] +; CHECK-NEXT: [[TMP14:%.*]] = phi target("x86.AMX") [ [[TMP15:%.*]], [[BB3]] ], [ [[TMP11]], [[BB1]] ] ; CHECK-NEXT: [[GOODPHI:%.*]] = phi <110 x i32> [ [[EVILPHI2:%.*]], [[BB3]] ], [ [[TMP13]], [[BB1]] ] -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP14]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") [[TMP14]]) ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP15]] = phi x86_amx [ [[TMP14]], [[BB2]] ], [ [[TMP11]], [[BB1]] ] +; CHECK-NEXT: [[TMP15]] = phi target("x86.AMX") [ [[TMP14]], [[BB2]] ], [ [[TMP11]], [[BB1]] ] ; CHECK-NEXT: [[EVILPHI2]] = phi <110 x i32> [ [[GOODPHI]], [[BB2]] ], [ [[TMP13]], [[BB1]] ] -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP15]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr 
undef, i64 undef, target("x86.AMX") [[TMP15]]) ; CHECK-NEXT: br i1 undef, label [[BB2]], label [[EXIT:%.*]] ; CHECK: exit: -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP15]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") [[TMP15]]) ; CHECK-NEXT: ret void ; wrapper_entry: - %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) - %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) + %0 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) + %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX") %0) br label %bb1 bb1: ; preds = %wrapper_entry - %1 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) - %2 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) - %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) - %4 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %1, x86_amx %2, x86_amx %3) - %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %4) + %1 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> undef) + %2 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> undef) + %3 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) + %4 = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, target("x86.AMX") %1, target("x86.AMX") %2, target("x86.AMX") %3) + %5 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX") %4) br i1 undef, label %bb2, label %bb3 bb2: ; preds = %bb1, %wrapper_entry %goodphi = phi <110 x i32> [ %evilphi2, %bb3], [ %5, %bb1 ] - %6 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) - call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %6) + %6 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") %6) br label %bb3 bb3: %evilphi2 = phi <110 x i32> [ %goodphi, %bb2 ], [ %5, %bb1 ] - %7 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) - call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %7) + %7 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") %7) br i1 undef, label %bb2, label %exit exit: - %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) - call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %8) + %8 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %evilphi2) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") %8) ret void } @@ -332,49 +332,49 @@ ; CHECK-LABEL: @eliminate_unused_phi_and_cast( ; CHECK-NEXT: wrapper_entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <560 x i8>, align 64 -; CHECK-NEXT: [[TMP1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) +; CHECK-NEXT: [[TMP1:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) ; CHECK-NEXT: br i1 undef, label [[FOR_COND_CLEANUP_I_I:%.*]], label [[FOR_BODY_I_LR_PH_I:%.*]] ; 
CHECK: for.body.i.lr.ph.i: -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr undef, i64 undef) -; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr undef, i64 undef) +; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr undef, i64 undef) +; CHECK-NEXT: [[TMP3:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr undef, i64 undef) ; CHECK-NEXT: store <560 x i8> undef, ptr [[TMP0]], align 1024 -; CHECK-NEXT: [[TMP5:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40) -; CHECK-NEXT: [[TMP6:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx [[TMP2]], x86_amx [[TMP3]], x86_amx [[TMP5]]) +; CHECK-NEXT: [[TMP5:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr [[TMP0]], i64 40) +; CHECK-NEXT: [[TMP6:%.*]] = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, target("x86.AMX") [[TMP2]], target("x86.AMX") [[TMP3]], target("x86.AMX") [[TMP5]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_I_I]] ; CHECK: for.cond.cleanup.i.i: -; CHECK-NEXT: [[TMP7:%.*]] = phi x86_amx [ [[TMP1]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP6]], [[FOR_BODY_I_LR_PH_I]] ] -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx [[TMP7]]) +; CHECK-NEXT: [[TMP7:%.*]] = phi target("x86.AMX") [ [[TMP1]], [[WRAPPER_ENTRY:%.*]] ], [ [[TMP6]], [[FOR_BODY_I_LR_PH_I]] ] +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") [[TMP7]]) ; CHECK-NEXT: ret void ; wrapper_entry: - %0 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) - %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %0) + %0 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 40, ptr undef, i64 undef) + %tmp = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX") %0) br i1 undef, label %for.cond.cleanup.i.i, label %for.body.i.lr.ph.i for.body.i.lr.ph.i: ; preds = %wrapper_entry - %1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr undef, i64 undef) - %v1 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %1) - %2 = call x86_amx @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr undef, i64 undef) - %v2 = call <616 x i8> @llvm.x86.cast.tile.to.vector.v616i8(x86_amx %2) - %3 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %v1) - %4 = call x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> %v2) - %5 = call x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) - %6 = call x86_amx @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, x86_amx %3, x86_amx %4, x86_amx %5) - %7 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx %6) + %1 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 11, i16 56, ptr undef, i64 undef) + %v1 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX") %1) + %2 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 14, i16 40, ptr undef, i64 undef) + %v2 = call <616 x i8> @llvm.x86.cast.tile.to.vector.v616i8(target("x86.AMX") %2) + %3 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %v1) + %4 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8> %v2) + %5 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8> undef) + %6 = call 
target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 11, i16 40, i16 56, target("x86.AMX") %3, target("x86.AMX") %4, target("x86.AMX") %5) + %7 = call <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX") %6) br label %for.cond.cleanup.i.i for.cond.cleanup.i.i: ; preds = %for.body.i.lr.ph.i, %wrapper_entry %goodphi = phi <110 x i32> [ %tmp, %wrapper_entry ], [ %7, %for.body.i.lr.ph.i ] - %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) - call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, x86_amx %8) + %8 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32> %goodphi) + call void @llvm.x86.tilestored64.internal(i16 11, i16 40, ptr undef, i64 undef, target("x86.AMX") %8) ret void } -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(x86_amx) -declare <616 x i8> @llvm.x86.cast.tile.to.vector.v616i8(x86_amx) -declare x86_amx @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32>) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) -declare x86_amx @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8>) -declare x86_amx @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8>) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare <110 x i32> @llvm.x86.cast.tile.to.vector.v110i32(target("x86.AMX")) +declare <616 x i8> @llvm.x86.cast.tile.to.vector.v616i8(target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.cast.vector.to.tile.v110i32(<110 x i32>) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.cast.vector.to.tile.v616i8(<616 x i8>) +declare target("x86.AMX") @llvm.x86.cast.vector.to.tile.v560i8(<560 x i8>) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) Index: llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll =================================================================== --- llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll +++ llvm/test/CodeGen/X86/AMX/lat-transform-amx-bitcast.ll @@ -6,27 +6,27 @@ @buf = dso_local global [1024 x i8] zeroinitializer, align 64 @buf2 = dso_local global [1024 x i8] zeroinitializer, align 64 -; test bitcast x86_amx to <256 x i32> +; test bitcast target("x86.AMX") to <256 x i32> define dso_local void @test_user_empty(i16 %m, i16 %n, ptr%buf, i64 %s) { ; CHECK-LABEL: @test_user_empty( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]]) +; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]]) ; CHECK-NEXT: ret void ; entry: - %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %n, ptr %buf, i64 %s) - %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + %t1 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %m, i16 %n, ptr %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t1) ret void } -; test bitcast <256 x i32> to x86_amx +; test bitcast <256 x i32> to target("x86.AMX") define dso_local void @test_user_empty2(<256 x i32> %in) { ; CHECK-LABEL: @test_user_empty2( ; CHECK-NEXT: entry: ; CHECK-NEXT: ret void ; entry: - %t = call x86_amx 
@llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %in) + %t = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %in) ret void } @@ -37,14 +37,14 @@ ; CHECK-NEXT: [[T1:%.*]] = load <256 x i32>, ptr [[IN:%.*]], align 64 ; CHECK-NEXT: store <256 x i32> [[T1]], ptr [[TMP0]], align 1024 ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[N:%.*]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N]], ptr [[TMP0]], i64 [[TMP1]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP2]]) +; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N]], ptr [[TMP0]], i64 [[TMP1]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[BUF:%.*]], i64 [[S:%.*]], target("x86.AMX") [[TMP2]]) ; CHECK-NEXT: ret <256 x i32> [[T1]] ; entry: %t1 = load <256 x i32>, ptr %in, align 64 - %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1) - call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, ptr %buf, i64 %s, x86_amx %t2) + %t2 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t1) + call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, ptr %buf, i64 %s, target("x86.AMX") %t2) ret <256 x i32> %t1 } @@ -55,14 +55,14 @@ ; CHECK-NEXT: [[T1:%.*]] = load <225 x i32>, ptr [[IN:%.*]], align 64 ; CHECK-NEXT: store <225 x i32> [[T1]], ptr [[TMP0]], align 1024 ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[N:%.*]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N]], ptr [[TMP0]], i64 [[TMP1]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP2]]) +; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[N]], ptr [[TMP0]], i64 [[TMP1]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[BUF:%.*]], i64 [[S:%.*]], target("x86.AMX") [[TMP2]]) ; CHECK-NEXT: ret <225 x i32> [[T1]] ; entry: %t1 = load <225 x i32>, ptr %in, align 64 - %t2 = call x86_amx @llvm.x86.cast.vector.to.tile.v225i32(<225 x i32> %t1) - call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, ptr %buf, i64 %s, x86_amx %t2) + %t2 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v225i32(<225 x i32> %t1) + call void @llvm.x86.tilestored64.internal(i16 %m, i16 %n, ptr %buf, i64 %s, target("x86.AMX") %t2) ret <225 x i32> %t1 } @@ -70,16 +70,16 @@ ; CHECK-LABEL: @test_amx_bitcast_store( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 -; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[M]], ptr [[BUF:%.*]], i64 [[S:%.*]]) +; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[M]], ptr [[BUF:%.*]], i64 [[S:%.*]]) ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[M]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[M]], ptr [[TMP0]], i64 [[TMP1]], x86_amx [[T1]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[M]], ptr [[TMP0]], i64 [[TMP1]], target("x86.AMX") [[T1]]) ; CHECK-NEXT: [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[M]], ptr [[OUT:%.*]], i64 64, x86_amx [[T1]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[M]], 
ptr [[OUT:%.*]], i64 64, target("x86.AMX") [[T1]]) ; CHECK-NEXT: ret <256 x i32> [[TMP2]] ; entry: - %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %m, i16 %m, ptr %buf, i64 %s) - %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + %t1 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %m, i16 %m, ptr %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t1) store <256 x i32> %t2, ptr %out ret <256 x i32> %t2 } @@ -91,14 +91,14 @@ ; CHECK-NEXT: [[ADD:%.*]] = add <256 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: store <256 x i32> [[ADD]], ptr [[TMP0]], align 1024 ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[C:%.*]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C]], ptr [[TMP0]], i64 [[TMP1]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[BUF:%.*]], i64 [[S:%.*]], x86_amx [[TMP2]]) +; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C]], ptr [[TMP0]], i64 [[TMP1]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[BUF:%.*]], i64 [[S:%.*]], target("x86.AMX") [[TMP2]]) ; CHECK-NEXT: ret void ; entry: %add = add <256 x i32> %y, %x - %t = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %add) - call void @llvm.x86.tilestored64.internal(i16 %r, i16 %c, ptr %buf, i64 %s, x86_amx %t) + %t = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %add) + call void @llvm.x86.tilestored64.internal(i16 %r, i16 %c, ptr %buf, i64 %s, target("x86.AMX") %t) ret void } @@ -106,16 +106,16 @@ ; CHECK-LABEL: @test_src_add2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 -; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]]) +; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[R:%.*]], i16 [[C:%.*]], ptr [[BUF:%.*]], i64 [[S:%.*]]) ; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[C]] to i64 -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[TMP0]], i64 [[TMP1]], x86_amx [[T1]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[R]], i16 [[C]], ptr [[TMP0]], i64 [[TMP1]], target("x86.AMX") [[T1]]) ; CHECK-NEXT: [[TMP2:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024 ; CHECK-NEXT: [[ADD:%.*]] = add <256 x i32> [[TMP2]], [[X:%.*]] ; CHECK-NEXT: ret void ; entry: - %t1 = call x86_amx @llvm.x86.tileloadd64.internal(i16 %r, i16 %c, ptr %buf, i64 %s) - %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + %t1 = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %r, i16 %c, ptr %buf, i64 %s) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t1) %add = add <256 x i32> %t2, %x ret void } @@ -127,9 +127,9 @@ ; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP2:%.*]], 32 ; CHECK-NEXT: [[TMP8:%.*]] = ashr exact i64 [[TMP7]], 32 -; CHECK-NEXT: [[TMP9:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP1:%.*]], i64 [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP1:%.*]], i64 [[TMP8]]) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP0]], i64 0, i32 2 -; CHECK-NEXT: call void 
@llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64, x86_amx [[TMP9]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64, target("x86.AMX") [[TMP9]]) ; CHECK-NEXT: ret void ; %4 = load i16, ptr %0, align 64 @@ -137,8 +137,8 @@ %6 = load i16, ptr %5, align 2 %7 = shl i64 %2, 32 %8 = ashr exact i64 %7, 32 - %9 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %4, i16 %6, ptr %1, i64 %8) - %10 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %9) + %9 = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 %4, i16 %6, ptr %1, i64 %8) + %10 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %9) %11 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 2 store <256 x i32> %10, ptr %11, align 64 ret void @@ -153,13 +153,13 @@ ; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[TMP7]], align 2 ; CHECK-NEXT: [[TMP9:%.*]] = udiv i16 [[TMP8]], 4 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP0:%.*]], i64 0, i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64) +; CHECK-NEXT: [[TMP11:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP1]], i64 0, i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP8]], ptr [[TMP12]], i64 64) +; CHECK-NEXT: [[TMP13:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP8]], ptr [[TMP12]], i64 64) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP2]], i64 0, i32 2 -; CHECK-NEXT: [[TMP15:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP6]], ptr [[TMP14]], i64 64) -; CHECK-NEXT: [[TMP16:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[TMP4]], i16 [[TMP6]], i16 [[TMP8]], x86_amx [[TMP11]], x86_amx [[TMP13]], x86_amx [[TMP15]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64, x86_amx [[TMP16]]) +; CHECK-NEXT: [[TMP15:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP9]], i16 [[TMP6]], ptr [[TMP14]], i64 64) +; CHECK-NEXT: [[TMP16:%.*]] = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 [[TMP4]], i16 [[TMP6]], i16 [[TMP8]], target("x86.AMX") [[TMP11]], target("x86.AMX") [[TMP13]], target("x86.AMX") [[TMP15]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP10]], i64 64, target("x86.AMX") [[TMP16]]) ; CHECK-NEXT: ret void ; %4 = load i16, ptr %1, align 64 @@ -169,15 +169,15 @@ %8 = load i16, ptr %7, align 2 %9 = getelementptr inbounds %struct.__tile_str, ptr %0, i64 0, i32 2 %10 = load <256 x i32>, ptr %9, align 64 - %11 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %10) + %11 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %10) %12 = getelementptr inbounds %struct.__tile_str, ptr %1, i64 0, i32 2 %13 = load <256 x i32>, ptr %12, align 64 - %14 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %13) + %14 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %13) %15 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 2 %16 = load <256 x i32>, ptr %15, align 64 - %17 = call x86_amx 
@llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %16) - %18 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %4, i16 %6, i16 %8, x86_amx %11, x86_amx %14, x86_amx %17) - %19 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %18) + %17 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %16) + %18 = tail call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 %4, i16 %6, i16 %8, target("x86.AMX") %11, target("x86.AMX") %14, target("x86.AMX") %17) + %19 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %18) store <256 x i32> %19, ptr %9, align 64 ret void } @@ -185,21 +185,21 @@ define dso_local void @__tile_dpbsud(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) { ; CHECK-LABEL: @__tile_dpbsud( ; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) -; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) -; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) -; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]]) +; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) +; CHECK-NEXT: [[TMP3:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) +; CHECK-NEXT: [[TMP4:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) +; CHECK-NEXT: [[T6:%.*]] = tail call target("x86.AMX") @llvm.x86.tdpbsud.internal(i16 [[M]], i16 [[N]], i16 [[K]], target("x86.AMX") [[TMP4]], target("x86.AMX") [[TMP2]], target("x86.AMX") [[TMP3]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, target("x86.AMX") [[T6]]) ; CHECK-NEXT: ret void ; %t0 = load <256 x i32>, ptr %pa, align 64 - %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0) + %t1 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0) %t2 = load <256 x i32>, ptr %pb, align 64 - %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2) + %t3 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2) %t4 = load <256 x i32>, ptr %pc, align 64 - %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4) - %t6 = tail call x86_amx @llvm.x86.tdpbsud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3) - %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6) + %t5 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4) + %t6 = tail call target("x86.AMX") @llvm.x86.tdpbsud.internal(i16 %m, i16 %n, i16 %k, target("x86.AMX") %t5, target("x86.AMX") %t1, target("x86.AMX") %t3) + %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t6) store <256 x i32> %t7, ptr %pc, align 64 ret void } @@ -207,21 +207,21 @@ define dso_local void @__tile_dpbusd(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) { ; CHECK-LABEL: @__tile_dpbusd( ; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx 
@llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) -; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) -; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) -; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]]) +; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) +; CHECK-NEXT: [[TMP3:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) +; CHECK-NEXT: [[TMP4:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) +; CHECK-NEXT: [[T6:%.*]] = tail call target("x86.AMX") @llvm.x86.tdpbusd.internal(i16 [[M]], i16 [[N]], i16 [[K]], target("x86.AMX") [[TMP4]], target("x86.AMX") [[TMP2]], target("x86.AMX") [[TMP3]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, target("x86.AMX") [[T6]]) ; CHECK-NEXT: ret void ; %t0 = load <256 x i32>, ptr %pa, align 64 - %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0) + %t1 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0) %t2 = load <256 x i32>, ptr %pb, align 64 - %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2) + %t3 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2) %t4 = load <256 x i32>, ptr %pc, align 64 - %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4) - %t6 = tail call x86_amx @llvm.x86.tdpbusd.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3) - %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6) + %t5 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4) + %t6 = tail call target("x86.AMX") @llvm.x86.tdpbusd.internal(i16 %m, i16 %n, i16 %k, target("x86.AMX") %t5, target("x86.AMX") %t1, target("x86.AMX") %t3) + %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t6) store <256 x i32> %t7, ptr %pc, align 64 ret void } @@ -229,21 +229,21 @@ define dso_local void @__tile_dpbuud(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) { ; CHECK-LABEL: @__tile_dpbuud( ; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) -; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) -; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) -; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]]) +; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) +; CHECK-NEXT: [[TMP3:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], 
i64 64) +; CHECK-NEXT: [[TMP4:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) +; CHECK-NEXT: [[T6:%.*]] = tail call target("x86.AMX") @llvm.x86.tdpbuud.internal(i16 [[M]], i16 [[N]], i16 [[K]], target("x86.AMX") [[TMP4]], target("x86.AMX") [[TMP2]], target("x86.AMX") [[TMP3]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, target("x86.AMX") [[T6]]) ; CHECK-NEXT: ret void ; %t0 = load <256 x i32>, ptr %pa, align 64 - %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0) + %t1 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0) %t2 = load <256 x i32>, ptr %pb, align 64 - %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2) + %t3 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2) %t4 = load <256 x i32>, ptr %pc, align 64 - %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4) - %t6 = tail call x86_amx @llvm.x86.tdpbuud.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3) - %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6) + %t5 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4) + %t6 = tail call target("x86.AMX") @llvm.x86.tdpbuud.internal(i16 %m, i16 %n, i16 %k, target("x86.AMX") %t5, target("x86.AMX") %t1, target("x86.AMX") %t3) + %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t6) store <256 x i32> %t7, ptr %pc, align 64 ret void } @@ -251,21 +251,21 @@ define dso_local void @__tile_dpbf16ps(i16 %m, i16 %n, i16 %k, ptr %pc, ptr %pa, ptr %pb) { ; CHECK-LABEL: @__tile_dpbf16ps( ; CHECK-NEXT: [[TMP1:%.*]] = udiv i16 [[K:%.*]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) -; CHECK-NEXT: [[TMP3:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) -; CHECK-NEXT: [[TMP4:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) -; CHECK-NEXT: [[T6:%.*]] = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 [[M]], i16 [[N]], i16 [[K]], x86_amx [[TMP4]], x86_amx [[TMP2]], x86_amx [[TMP3]]) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, x86_amx [[T6]]) +; CHECK-NEXT: [[TMP2:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M:%.*]], i16 [[K]], ptr [[PA:%.*]], i64 64) +; CHECK-NEXT: [[TMP3:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP1]], i16 [[N:%.*]], ptr [[PB:%.*]], i64 64) +; CHECK-NEXT: [[TMP4:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[M]], i16 [[N]], ptr [[PC:%.*]], i64 64) +; CHECK-NEXT: [[T6:%.*]] = tail call target("x86.AMX") @llvm.x86.tdpbf16ps.internal(i16 [[M]], i16 [[N]], i16 [[K]], target("x86.AMX") [[TMP4]], target("x86.AMX") [[TMP2]], target("x86.AMX") [[TMP3]]) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[M]], i16 [[N]], ptr [[PC]], i64 64, target("x86.AMX") [[T6]]) ; CHECK-NEXT: ret void ; %t0 = load <256 x i32>, ptr %pa, align 64 - %t1 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0) + %t1 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t0) %t2 = load <256 x i32>, ptr %pb, align 64 - %t3 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2) + %t3 = call target("x86.AMX") 
@llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t2) %t4 = load <256 x i32>, ptr %pc, align 64 - %t5 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4) - %t6 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 %m, i16 %n, i16 %k, x86_amx %t5, x86_amx %t1, x86_amx %t3) - %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t6) + %t5 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t4) + %t6 = tail call target("x86.AMX") @llvm.x86.tdpbf16ps.internal(i16 %m, i16 %n, i16 %k, target("x86.AMX") %t5, target("x86.AMX") %t1, target("x86.AMX") %t3) + %t7 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t6) store <256 x i32> %t7, ptr %pc, align 64 ret void } @@ -276,10 +276,10 @@ ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR:%.*]], ptr [[TMP2]], i64 0, i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT___TILE_STR]], ptr [[TMP2]], i64 0, i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP7]], i64 64) +; CHECK-NEXT: [[TMP8:%.*]] = call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP7]], i64 64) ; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[TMP1:%.*]], 32 ; CHECK-NEXT: [[TMP10:%.*]] = ashr exact i64 [[TMP9]], 32 -; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP0:%.*]], i64 [[TMP10]], x86_amx [[TMP8]]) +; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[TMP4]], i16 [[TMP6]], ptr [[TMP0:%.*]], i64 [[TMP10]], target("x86.AMX") [[TMP8]]) ; CHECK-NEXT: ret void ; %4 = load i16, ptr %2, align 64 @@ -287,10 +287,10 @@ %6 = load i16, ptr %5, align 2 %7 = getelementptr inbounds %struct.__tile_str, ptr %2, i64 0, i32 2 %8 = load <256 x i32>, ptr %7, align 64 - %9 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %8) + %9 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %8) %10 = shl i64 %1, 32 %11 = ashr exact i64 %10, 32 - tail call void @llvm.x86.tilestored64.internal(i16 %4, i16 %6, ptr %0, i64 %11, x86_amx %9) + tail call void @llvm.x86.tilestored64.internal(i16 %4, i16 %6, ptr %0, i64 %11, target("x86.AMX") %9) ret void } @@ -300,8 +300,8 @@ ; CHECK-NEXT: [[TMP0:%.*]] = alloca <256 x i32>, align 64 ; CHECK-NEXT: br i1 undef, label [[L1:%.*]], label [[L2:%.*]] ; CHECK: l1: -; CHECK-NEXT: [[T1:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) -; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, x86_amx [[T1]]) +; CHECK-NEXT: [[T1:%.*]] = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) +; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 8, i16 32, ptr [[TMP0]], i64 32, target("x86.AMX") [[T1]]) ; CHECK-NEXT: [[TMP1:%.*]] = load <256 x i32>, ptr [[TMP0]], align 1024 ; CHECK-NEXT: br i1 undef, label [[L2]], label [[EXIT:%.*]] ; CHECK: l2: @@ -315,14 +315,14 @@ br i1 undef, label %l1, label %l2 l1: - %t1 = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 32) - %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t1) + %t1 = call target("x86.AMX") @llvm.x86.tilezero.internal(i16 8, i16 32) + %t2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t1) br i1 undef, label %l2, label %exit l2: %t3 = phi <256 x i32> [ undef, %entry ], [ %t2, %l1 ] - %t4 = call x86_amx 
@llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) - %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %t4) + %t4 = call target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %t3) + %t5 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX") %t4) store <256 x i32> %t5, ptr %buf br label %exit @@ -330,16 +330,16 @@ ret void } -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbuud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tilezero.internal(i16, i16) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbsud.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbusd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbuud.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.tdpbf16ps.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) -declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) -declare x86_amx @llvm.x86.cast.vector.to.tile.v225i32(<225 x i32>) -declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) -declare <225 x i32> @llvm.x86.cast.tile.to.vector.v225i32(x86_amx) +declare target("x86.AMX") @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) +declare target("x86.AMX") @llvm.x86.cast.vector.to.tile.v225i32(<225 x i32>) +declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(target("x86.AMX")) +declare <225 x i32> @llvm.x86.cast.tile.to.vector.v225i32(target("x86.AMX")) Index: llvm/test/Transforms/InstCombine/X86/x86-amx-load-store.ll =================================================================== --- llvm/test/Transforms/InstCombine/X86/x86-amx-load-store.ll +++ llvm/test/Transforms/InstCombine/X86/x86-amx-load-store.ll @@ -6,14 +6,14 @@ ; CHECK-LABEL: @test_amx_load_store( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[VEC:%.*]] = load <256 x i32>, ptr [[SRC:%.*]], align 64 -; CHECK-NEXT: [[BC:%.*]] = bitcast <256 x i32> [[VEC]] to x86_amx -; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 16, i16 16, ptr [[DST:%.*]], i64 64, x86_amx [[BC]]) +; CHECK-NEXT: [[BC:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[VEC]]) +; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 16, i16 16, ptr [[DST:%.*]], i64 64, target("x86.AMX") [[BC]]) ; CHECK-NEXT: ret void ; entry: %vec = load <256 x i32>, ptr %src, align 64 - %bc = bitcast <256 x i32> %vec to x86_amx - tail call void @llvm.x86.tilestored64.internal(i16 16, i16 16, ptr %dst, i64 64, x86_amx %bc) + %bc = call target("x86.AMX") 
@llvm.x86.bitconvert.vector.to.tile(<256 x i32> %vec) + tail call void @llvm.x86.tilestored64.internal(i16 16, i16 16, ptr %dst, i64 64, target("x86.AMX") %bc) ret void } @@ -21,17 +21,19 @@ define dso_local void @test_amx_load_store2(ptr %dst, ptr %src) { ; CHECK-LABEL: @test_amx_load_store2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[AMX:%.*]] = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 16, ptr [[SRC:%.*]], i64 64) -; CHECK-NEXT: [[BC:%.*]] = bitcast x86_amx [[AMX]] to <256 x i32> +; CHECK-NEXT: [[AMX:%.*]] = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 16, ptr [[SRC:%.*]], i64 64) +; CHECK-NEXT: [[BC:%.*]] = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") [[AMX]]) ; CHECK-NEXT: store <256 x i32> [[BC]], ptr [[DST:%.*]], align 1024 ; CHECK-NEXT: ret void ; entry: - %amx = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 16, ptr %src, i64 64) - %bc = bitcast x86_amx %amx to <256 x i32> + %amx = tail call target("x86.AMX") @llvm.x86.tileloadd64.internal(i16 16, i16 16, ptr %src, i64 64) + %bc = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %amx) store <256 x i32> %bc, ptr %dst ret void } -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) +declare <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32>) \ No newline at end of file Index: llvm/test/Transforms/InstCombine/X86/x86-amx.ll =================================================================== --- llvm/test/Transforms/InstCombine/X86/x86-amx.ll +++ llvm/test/Transforms/InstCombine/X86/x86-amx.ll @@ -19,11 +19,11 @@ ; CHECK: for.body24: ; CHECK-NEXT: [[T6:%.*]] = load <256 x i32>, ptr [[ARRAYIDX29:%.*]], align 64 ; CHECK-NEXT: [[T7:%.*]] = load <256 x i32>, ptr [[ARRAYIDX35:%.*]], align 64 -; CHECK-NEXT: [[T8:%.*]] = bitcast <256 x i32> [[SUB_C_SROA_0_0]] to x86_amx -; CHECK-NEXT: [[T9:%.*]] = bitcast <256 x i32> [[T6]] to x86_amx -; CHECK-NEXT: [[T10:%.*]] = bitcast <256 x i32> [[T7]] to x86_amx -; CHECK-NEXT: [[T11:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 1, i16 4, i16 4, x86_amx [[T8]], x86_amx [[T9]], x86_amx [[T10]]) -; CHECK-NEXT: [[T12]] = bitcast x86_amx [[T11]] to <256 x i32> +; CHECK-NEXT: [[T8:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[SUB_C_SROA_0_0]]) +; CHECK-NEXT: [[T9:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[T6]]) +; CHECK-NEXT: [[T10:%.*]] = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> [[T7]]) +; CHECK-NEXT: [[T11:%.*]] = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 1, i16 4, i16 4, target("x86.AMX") [[T8]], target("x86.AMX") [[T9]], target("x86.AMX") [[T10]]) +; CHECK-NEXT: [[T12]] = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") [[T11]]) ; CHECK-NEXT: br label [[FOR_COND18]] ; CHECK: exit: ; CHECK-NEXT: ret void @@ -48,17 +48,19 @@ for.body24: ; preds = %for.cond18 %t6 = load <256 x i32>, ptr %arrayidx29, align 64 %t7 = load <256 x i32>, ptr %arrayidx35, align 64 - %t8 = bitcast <256 x i32> %sub_c.sroa.0.0 to x86_amx - %t9 = bitcast <256 x i32> %t6 to x86_amx - %t10 = bitcast <256 x i32> %t7 to x86_amx - %t11 = call x86_amx 
@llvm.x86.tdpbssd.internal(i16 1, i16 4, i16 4, x86_amx %t8, x86_amx %t9, x86_amx %t10) #12 - %t12 = bitcast x86_amx %t11 to <256 x i32> + %t8 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %sub_c.sroa.0.0) + %t9 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t6) + %t10 = call target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32> %t7) + %t11 = call target("x86.AMX") @llvm.x86.tdpbssd.internal(i16 1, i16 4, i16 4, target("x86.AMX") %t8, target("x86.AMX") %t9, target("x86.AMX") %t10) #12 + %t12 = call <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX") %t11) br label %for.cond18 exit: ret void } -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) +declare target("x86.AMX") @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) +declare target("x86.AMX") @llvm.x86.tdpbssd.internal(i16, i16, i16, target("x86.AMX"), target("x86.AMX"), target("x86.AMX")) +declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, target("x86.AMX")) +declare <256 x i32> @llvm.x86.bitconvert.tile.to.vector(target("x86.AMX")) +declare target("x86.AMX") @llvm.x86.bitconvert.vector.to.tile(<256 x i32>) Index: llvm/test/Verifier/x86_amx1.ll =================================================================== --- llvm/test/Verifier/x86_amx1.ll +++ /dev/null @@ -1,4 +0,0 @@ -; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s - -@GV = dso_local global [10 x x86_amx] zeroinitializer, align 16 -; CHECK: invalid array element type Index: llvm/test/Verifier/x86_amx2.ll =================================================================== --- llvm/test/Verifier/x86_amx2.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s - -define dso_local void @f() { -entry: - %a.addr = alloca <2 x x86_amx>, align 4 - ret void -} - -; CHECK: invalid vector element type Index: llvm/test/Verifier/x86_amx3.ll =================================================================== --- llvm/test/Verifier/x86_amx3.ll +++ /dev/null @@ -1,8 +0,0 @@ -; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s - -define void @f(x86_amx %A, x86_amx %B) { -entry: - alloca x86_amx -; CHECK: invalid type for alloca - ret void -} Index: llvm/test/Verifier/x86_amx4.ll =================================================================== --- llvm/test/Verifier/x86_amx4.ll +++ /dev/null @@ -1,4 +0,0 @@ -; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s - -@GV = external global x86_amx -; CHECK: invalid type for global variable Index: llvm/test/Verifier/x86_amx5.ll =================================================================== --- llvm/test/Verifier/x86_amx5.ll +++ /dev/null @@ -1,7 +0,0 @@ -; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s - -define void @f(x86_amx %A) { -entry: - ret void -} -; CHECK: Function takes x86_amx but isn't an intrinsic Index: llvm/test/Verifier/x86_amx6.ll =================================================================== --- llvm/test/Verifier/x86_amx6.ll +++ /dev/null @@ -1,7 +0,0 @@ -; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s - -define x86_amx @f() { -entry: - ret x86_amx undef -} -; CHECK: Function returns a x86_amx but isn't an intrinsic Index: llvm/test/Verifier/x86_amx7.ll =================================================================== --- llvm/test/Verifier/x86_amx7.ll +++ /dev/null @@ -1,8 +0,0 @@ -; RUN: not 
llvm-as %s -o /dev/null 2>&1 | FileCheck %s - -define void @f() { -entry: - call x86_amx () undef () - ret void -} -; CHECK: Return type cannot be x86_amx for indirect call! Index: llvm/test/Verifier/x86_amx9.ll =================================================================== --- llvm/test/Verifier/x86_amx9.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s - -@buf = dso_local global [1024 x i8] zeroinitializer, align 16 - -define dso_local void @test_tile_init(i16 signext %row, i16 signext %col) { -entry: - tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, ptr @buf, i64 64, x86_amx bitcast (<256 x i32> to x86_amx)) - ret void -} -; CHECK: const x86_amx is not allowed in argument! - -declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) Index: llvm/tools/llvm-c-test/echo.cpp =================================================================== --- llvm/tools/llvm-c-test/echo.cpp +++ llvm/tools/llvm-c-test/echo.cpp @@ -153,8 +153,6 @@ LLVMGetVectorSize(Src)); case LLVMMetadataTypeKind: return LLVMMetadataTypeInContext(Ctx); - case LLVMX86_AMXTypeKind: - return LLVMX86AMXTypeInContext(Ctx); case LLVMX86_MMXTypeKind: return LLVMX86MMXTypeInContext(Ctx); case LLVMTokenTypeKind: Index: llvm/utils/emacs/llvm-mode.el =================================================================== --- llvm/utils/emacs/llvm-mode.el +++ llvm/utils/emacs/llvm-mode.el @@ -36,7 +36,7 @@ '("%[-]?[0-9]+" . font-lock-variable-name-face) ;; Types `(,(regexp-opt - '("void" "i1" "i8" "i16" "i32" "i64" "i128" "half" "bfloat" "float" "double" "fp128" "x86_fp80" "ppc_fp128" "x86_mmx" "x86_amx" + '("void" "i1" "i8" "i16" "i32" "i64" "i128" "half" "bfloat" "float" "double" "fp128" "x86_fp80" "ppc_fp128" "x86_mmx" "type" "label" "opaque" "token") 'symbols) . font-lock-type-face) ;; Integer literals '("\\b[-]?[0-9]+\\b" . font-lock-preprocessor-face) Index: llvm/utils/kate/llvm.xml =================================================================== --- llvm/utils/kate/llvm.xml +++ llvm/utils/kate/llvm.xml @@ -141,7 +141,6 @@ x86_fp80 ppc_fp128 x86_mmx - x86_amx void label metadata Index: llvm/utils/llvm.grm =================================================================== --- llvm/utils/llvm.grm +++ llvm/utils/llvm.grm @@ -195,7 +195,7 @@ GlobalVarAttribute ::= SectionString | align EUINT64VAL ; PrimType ::= INTTYPE | half | bfloat | float | double | "ppc_fp128" | fp128 - | "x86_fp80" | "x86_mmx" | "x86_amx" | - label ; + | "x86_fp80" | "x86_mmx" | - label ; Types ::= opaque @@ -420,5 +420,5 @@ | OptVolatile store ResolvedVal ^ "," Types ValueRef OptCAlign | getresult Types ValueRef ^ "," EUINT64VAL | getelementptr OptInBounds Types ValueRef IndexList - | extractvalue Types ValueRef ^ ConstantIndexList + | extractvalue Types ValueRef ^ ConstantIndexList | insertvalue Types ValueRef ^ "," Types ValueRef ^ ConstantIndexList ; Index: llvm/utils/vim/syntax/llvm.vim =================================================================== --- llvm/utils/vim/syntax/llvm.vim +++ llvm/utils/vim/syntax/llvm.vim @@ -15,7 +15,7 @@ " Types also include struct, array, vector, etc. but these don't " benefit as much from having dedicated highlighting rules. 
syn keyword llvmType void half bfloat float double x86_fp80 fp128 ppc_fp128 -syn keyword llvmType label metadata x86_mmx x86_amx +syn keyword llvmType label metadata x86_mmx syn keyword llvmType type label opaque token ptr syn match llvmType /\<i\d\+\>/ Index: llvm/utils/vscode/llvm/syntaxes/ll.tmLanguage.yaml =================================================================== --- llvm/utils/vscode/llvm/syntaxes/ll.tmLanguage.yaml +++ llvm/utils/vscode/llvm/syntaxes/ll.tmLanguage.yaml @@ -23,7 +23,6 @@ \\blabel\\b|\ \\bmetadata\\b|\ \\bx86_mmx\\b|\ - \\bx86_amx\\b|\ \\btype\\b|\ \\blabel\\b|\ \\bopaque\\b|\