diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -104,6 +104,7 @@
 TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tilezero_internal, "V256iUsUs", "n", "amx-tile")
 // AMX
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
--- a/clang/lib/Headers/amxintrin.h
+++ b/clang/lib/Headers/amxintrin.h
@@ -251,7 +251,7 @@
   _tile1024i tile;
 } __tile1024i;
 
-__DEFAULT_FN_ATTRS_INT8
+__DEFAULT_FN_ATTRS_TILE
 static void __tile_loadd(__tile1024i *dst, const void *base,
                          __SIZE_TYPE__ stride) {
   dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
@@ -264,10 +264,15 @@
                                     src1.tile, src2.tile);
 }
 
-__DEFAULT_FN_ATTRS_INT8
+__DEFAULT_FN_ATTRS_TILE
 static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
   _tile_stored_internal(src.row, src.col, base, stride, src.tile);
 }
 
+__DEFAULT_FN_ATTRS_TILE
+static void __tile_zero(__tile1024i *dst) {
+  dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
+}
+
 #endif /* __x86_64__ */
 #endif /* __AMXINTRIN_H */
diff --git a/clang/test/CodeGen/X86/amx_api.c b/clang/test/CodeGen/X86/amx_api.c
--- a/clang/test/CodeGen/X86/amx_api.c
+++ b/clang/test/CodeGen/X86/amx_api.c
@@ -52,3 +52,10 @@
   //CHECK-NEXT: call void @llvm.x86.tilestored64.internal
   __tile_stored(buf, STRIDE, c);
 }
+
+void test_tile_zero(__tile1024i c) {
+  //CHECK-LABEL: @test_tile_zero
+  //CHECK: call x86_amx @llvm.x86.tilezero.internal
+  //CHECK-NEXT: bitcast x86_amx {{%.*}} to <256 x i32>
+  __tile_zero(&c);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5057,6 +5057,10 @@
               GCCBuiltin<"__builtin_ia32_tilestored64_internal">,
               Intrinsic<[], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty,
                              llvm_i64_ty, llvm_x86amx_ty], []>;
+  def int_x86_tilezero_internal :
+              GCCBuiltin<"__builtin_ia32_tilezero_internal">,
+              Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty],
+                        []>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -494,6 +494,12 @@
     MI.setDesc(TII->get(X86::TILESTORED));
     return true;
   }
+  case X86::PTILEZEROV: {
+    for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg
+      MI.RemoveOperand(i);
+    MI.setDesc(TII->get(X86::TILEZERO));
+    return true;
+  }
   }
   llvm_unreachable("Previous switch has a fallthrough?");
 }
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4641,6 +4641,18 @@
       ReplaceNode(Node, CNode);
       return;
     }
+    case Intrinsic::x86_tilezero_internal: {
+      if (!Subtarget->hasAMXTILE())
+        break;
+      unsigned Opc = X86::PTILEZEROV;
+      SDValue Chain = Node->getOperand(0);
+      SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
+      SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain};
+      MachineSDNode *CNode =
+          CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
+      ReplaceNode(Node, CNode);
+      return;
+    }
     }
     break;
   }
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -62,6 +62,9 @@
     def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
                                 GR16:$src2, opaquemem:$src3,
                                 TILE:$src4, TILECFG:$cfg), []>;
+    def PTILEZEROV : PseudoI<(outs TILE:$dst), (ins GR16:$src1,
+                             GR16:$src2,
+                             TILECFG:$cfg), []>;
 
     let usesCustomInserter = 1 in {
       // Pseudo instructions, using immediates instead of tile registers.
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -132,6 +132,7 @@
     llvm_unreachable("Unexpected machine instruction on tile");
   case X86::PTILELOADDV:
   case X86::PTDPBSSDV:
+  case X86::PTILEZEROV:
     MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
     MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
     ShapeT Shape(&MO1, &MO2, MRI);
@@ -230,6 +231,7 @@
   case X86::PTILELOADDV:
   case X86::PTILESTOREDV:
   case X86::PTDPBSSDV:
+  case X86::PTILEZEROV:
     unsigned NumOperands = MI.getNumOperands();
     MI.RemoveOperand(NumOperands - 1);
     MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -873,6 +873,7 @@
   // We only collect the tile shape that is defined.
   case X86::PTILELOADDV:
   case X86::PTDPBSSDV:
+  case X86::PTILEZEROV:
     MachineOperand &MO1 = MI->getOperand(1);
     MachineOperand &MO2 = MI->getOperand(2);
     ShapeT Shape(&MO1, &MO2, MRI);
diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -mattr=+avx512f -verify-machineinstrs | FileCheck %s
+
+define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
+; CHECK-LABEL: test_amx:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, %ax
+; CHECK-NEXT:    tilezero %tmm0
+; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm1
+; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm2
+; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:    tilestored %tmm0, (%rdi,%rdx)
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
+  %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
+  %b = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
+  %d = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
+  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d)
+
+  ret void
+}
+
+declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
+declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)