diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -103,6 +103,7 @@ // AMX internal builtin TARGET_BUILTIN(__builtin_ia32_tile_loadconfig_internal, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile") +TARGET_BUILTIN(__builtin_ia32_tileloaddt164_internal, "V256iUsUsvC*z", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbsud_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbusd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8") diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h --- a/clang/lib/Headers/amxintrin.h +++ b/clang/lib/Headers/amxintrin.h @@ -239,6 +239,14 @@ (__SIZE_TYPE__)(stride)); } +/// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloaddt164_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} + /// This is internal intrinsic. C/C++ user should avoid calling it directly. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k, @@ -311,6 +319,27 @@ dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); } +/// Load tile rows from memory specified by "base" address and "stride" into +/// destination tile "dst". This intrinsic provides a hint to the implementation +/// that the data will likely not be reused in the near future and the data +/// caching can be optimized accordingly. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILELOADDT1 instruction.
+/// +/// \param dst +/// A destination tile. Max size is 1024 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TILE +static void __tile_stream_loadd(__tile1024i *dst, const void *base, + __SIZE_TYPE__ stride) { + dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride); +} + /// Compute dot-product of bytes in tiles with a source/destination accumulator. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit diff --git a/clang/test/CodeGen/X86/amx_api.c b/clang/test/CodeGen/X86/amx_api.c --- a/clang/test/CodeGen/X86/amx_api.c +++ b/clang/test/CodeGen/X86/amx_api.c @@ -39,6 +39,14 @@ __tile_loadd(&a, buf, STRIDE); } +void test_tile_stream_loadd(short row, short col) { + //CHECK-LABEL: @test_tile_stream_loadd + //CHECK: call x86_amx @llvm.x86.tileloaddt164.internal + //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32> + __tile1024i a = {row, col}; + __tile_stream_loadd(&a, buf, STRIDE); +} + void test_tile_dpbssd(__tile1024i a, __tile1024i b, __tile1024i c) { //CHECK-LABEL: @test_tile_dpbssd //CHECK: call x86_amx @llvm.x86.tdpbssd.internal diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -5050,6 +5050,11 @@ Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], []>; + def int_x86_tileloaddt164_internal : + GCCBuiltin<"__builtin_ia32_tileloaddt164_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty], + []>; def int_x86_tdpbssd_internal : GCCBuiltin<"__builtin_ia32_tdpbssd_internal">, Intrinsic<[llvm_x86amx_ty], diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp --- 
a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -554,10 +554,13 @@ MI.setDesc(TII->get(X86::LDTILECFG)); return true; } - case X86::PTILELOADDV: { + case X86::PTILELOADDV: + case X86::PTILELOADDT1V: { for (unsigned i = 2; i > 0; --i) MI.RemoveOperand(i); - MI.setDesc(TII->get(X86::TILELOADD)); + unsigned Opc = + Opcode == X86::PTILELOADDV ? X86::TILELOADD : X86::TILELOADDT1; + MI.setDesc(TII->get(Opc)); return true; } case X86::PTDPBSSDV: diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -122,7 +122,8 @@ } bool X86FastTileConfig::isTileLoad(MachineInstr &MI) { - return MI.getOpcode() == X86::PTILELOADDV; + return MI.getOpcode() == X86::PTILELOADDV || + MI.getOpcode() == X86::PTILELOADDT1V; } bool X86FastTileConfig::isTileStore(MachineInstr &MI) { return MI.getOpcode() == X86::PTILESTOREDV; diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4617,10 +4617,13 @@ ReplaceNode(Node, Res); return; } - case Intrinsic::x86_tileloadd64_internal: { + case Intrinsic::x86_tileloadd64_internal: + case Intrinsic::x86_tileloaddt164_internal: { if (!Subtarget->hasAMXTILE()) break; - unsigned Opc = X86::PTILELOADDV; + unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal + ? 
X86::PTILELOADDV + : X86::PTILELOADDT1V; // _tile_loadd_internal(row, col, buf, STRIDE) SDValue Base = Node->getOperand(4); SDValue Scale = getI8Imm(1, dl); diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -53,6 +53,9 @@ def PTILELOADDV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3), []>; + def PTILELOADDT1V : PseudoI<(outs TILE:$dst), (ins GR16:$src1, + GR16:$src2, + opaquemem:$src3), []>; def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, TILE:$src4), []>; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -121,6 +121,7 @@ default: llvm_unreachable("Expect amx intrinsics"); case Intrinsic::x86_tileloadd64_internal: + case Intrinsic::x86_tileloaddt164_internal: case Intrinsic::x86_tilestored64_internal: { Row = II->getArgOperand(0); Col = II->getArgOperand(1); diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -892,6 +892,7 @@ } // We only collect the tile shape that is defined. 
case X86::PTILELOADDV: + case X86::PTILELOADDT1V: case X86::PTDPBSSDV: case X86::PTDPBSUDV: case X86::PTDPBUSDV: diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll --- a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll @@ -23,6 +23,7 @@ ; CHECK-NEXT: tdpbusd %tmm2, %tmm1, %tmm0 ; CHECK-NEXT: tdpbuud %tmm2, %tmm1, %tmm0 ; CHECK-NEXT: tdpbf16ps %tmm2, %tmm1, %tmm0 +; CHECK-NEXT: tileloaddt1 (%rsi,%rdx), %tmm1 ; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx) ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper @@ -35,6 +36,7 @@ %d2 = call x86_amx @llvm.x86.tdpbusd.internal(i16 8, i16 8, i16 8, x86_amx %d1, x86_amx %a, x86_amx %b) %d3 = call x86_amx @llvm.x86.tdpbuud.internal(i16 8, i16 8, i16 8, x86_amx %d2, x86_amx %a, x86_amx %b) %d4 = call x86_amx @llvm.x86.tdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %d3, x86_amx %a, x86_amx %b) + %e = call x86_amx @llvm.x86.tileloaddt164.internal(i16 8, i16 8, i8* %base, i64 %stride) call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d4) ret void @@ -42,6 +44,7 @@ declare x86_amx @llvm.x86.tilezero.internal(i16, i16) declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) +declare x86_amx @llvm.x86.tileloaddt164.internal(i16, i16, i8*, i64) declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbsud.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare x86_amx @llvm.x86.tdpbusd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)