diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -461,25 +461,13 @@ case TargetOpcode::ICALL_BRANCH_FUNNEL: ExpandICallBranchFunnel(&MBB, MBBI); return true; - case X86::PLDTILECFG: { - MI.RemoveOperand(0); - MI.setDesc(TII->get(X86::LDTILECFG)); - return true; - } - case X86::PSTTILECFG: { - MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg - MI.setDesc(TII->get(X86::STTILECFG)); - return true; - } case X86::PTILELOADDV: { - MI.RemoveOperand(8); // Remove $tmmcfg for (unsigned i = 2; i > 0; --i) MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILELOADD)); return true; } case X86::PTDPBSSDV: { - MI.RemoveOperand(7); // Remove $tmmcfg MI.untieRegOperand(4); for (unsigned i = 3; i > 0; --i) MI.RemoveOperand(i); @@ -488,14 +476,13 @@ return true; } case X86::PTILESTOREDV: { - MI.RemoveOperand(8); // Remove $tmmcfg for (int i = 1; i >= 0; --i) MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILESTORED)); return true; } case X86::PTILEZEROV: { - for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg + for (int i = 2; i > 0; --i) // Remove row, col MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILEZERO)); return true; diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2094,8 +2094,12 @@ // Emit tilerelease for AMX kernel. 
const MachineRegisterInfo &MRI = MF.getRegInfo(); - if (!MRI.reg_nodbg_empty(X86::TMMCFG)) - BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE)); + const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID); + for (unsigned I = 0; I < RC->getNumRegs(); I++) + if (!MRI.reg_nodbg_empty(X86::TMM0 + I)) { + BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE)); + break; + } } StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4607,7 +4607,6 @@ SDValue Index = Node->getOperand(5); SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; SDValue Ops[] = {Node->getOperand(2), @@ -4617,7 +4616,6 @@ Index, Disp, Segment, - CFG, Chain}; CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); ReplaceNode(Node, CNode); @@ -4628,14 +4626,12 @@ break; SDValue Chain = Node->getOperand(0); unsigned Opc = X86::PTDPBSSDV; - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Node->getOperand(4), Node->getOperand(5), Node->getOperand(6), Node->getOperand(7), - CFG, Chain}; MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); @@ -4647,8 +4643,7 @@ break; unsigned Opc = X86::PTILEZEROV; SDValue Chain = Node->getOperand(0); - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); - SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain}; + SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Chain}; MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); ReplaceNode(Node, CNode); @@ -4719,7 +4714,6 @@ SDValue Index = 
Node->getOperand(5); SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; SDValue Ops[] = {Node->getOperand(2), @@ -4730,7 +4724,6 @@ Disp, Segment, Node->getOperand(6), - CFG, Chain}; CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); ReplaceNode(Node, CNode); diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -48,23 +48,14 @@ VEX, T8XD; // Pseduo instruction for RA. - let hasSideEffects = 1, mayLoad = 1, - Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in - def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>; - - let hasSideEffects = 1, mayStore = 1 in - def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>; - def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, - opaquemem:$src3, - TILECFG:$cfg), []>; + opaquemem:$src3), []>; def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, - TILE:$src4, TILECFG:$cfg), []>; + TILE:$src4), []>; def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, - GR16:$src2, - TILECFG:$cfg), []>; + GR16:$src2), []>; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. @@ -104,7 +95,7 @@ let Constraints = "$src4 = $dst" in def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6, TILECFG:$cfg), []>; + TILE:$src5, TILE:$src6), []>; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3808,10 +3808,6 @@ MachineOperand &MO = NewMI->getOperand(2); MO.setReg(VirtReg); MO.setIsKill(true); - } else if (RC->getID() == X86::TILECFGRegClassID) { - unsigned Opc = X86::PSTTILECFG; - addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) - .addReg(SrcReg, getKillRegState(isKill)); } else { unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = @@ -3840,10 +3836,6 @@ MachineOperand &MO = NewMI->getOperand(3); MO.setReg(VirtReg); MO.setIsKill(true); - } else if (RC->getID() == X86::TILECFGRegClassID) { - unsigned Opc = X86::PLDTILECFG; - addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), - FrameIdx); } else { const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); @@ -6789,7 +6781,7 @@ // ENDBR instructions should not be scheduled around. unsigned Opcode = MI.getOpcode(); if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 || - Opcode == X86::PLDTILECFG) + Opcode == X86::LDTILECFG) return true; return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF); diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -98,10 +98,9 @@ MachineFunctionPass::getAnalysisUsage(AU); } -static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx, - const TargetInstrInfo *TII, - MachineRegisterInfo *MRI, - const X86Subtarget *ST) { +static void buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx, + const TargetInstrInfo *TII, MachineRegisterInfo *MRI, + const X86Subtarget *ST) { auto *MBB = MI->getParent(); // FIXME: AMX should assume AVX512 enabled. 
@@ -117,12 +116,8 @@ } // build psuedo ldtilecfg - Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass); - - addFrameReference( - BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx); - - return VReg; + addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::LDTILECFG)), + FrameIdx); } static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) { @@ -219,26 +214,98 @@ return &*MII; } -static void addTileCFGUse(MachineFunction &MF, Register CFG) { - for (MachineBasicBlock &MBB : MF) { - - // Traverse the basic block. - for (MachineInstr &MI : MBB) { - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - default: - break; - case X86::PTILELOADDV: - case X86::PTILESTOREDV: - case X86::PTDPBSSDV: - case X86::PTILEZEROV: - unsigned NumOperands = MI.getNumOperands(); - MI.RemoveOperand(NumOperands - 1); - MI.addOperand(MF, MachineOperand::CreateReg(CFG, false)); - break; +static bool isAMXInstruction(MachineBasicBlock::iterator MII) { + switch (MII->getOpcode()) { + default: + return false; + case X86::PTILELOADDV: + case X86::PTILESTOREDV: + case X86::PTDPBSSDV: + case X86::PTILEZEROV: + return true; + } +} + +struct BBInfo { + bool HasAMX = false; + bool HasCallBeforeAMX = false; + bool HasAMXBeforeCallInSuccs = false; + MachineInstr *LastCall = nullptr; + + BBInfo() = default; + BBInfo(SmallSet<MachineInstr *, 8> &CfgNeedInsert, MachineBasicBlock *MBB, + MachineInstr *MI = nullptr) { + MachineBasicBlock::iterator MII = MI ?
MI->getIterator() : MBB->begin(); + for (auto E = MBB->end(); MII != E; ++MII) { + if (isAMXInstruction(MII)) { + HasAMX = true; + if (LastCall) + CfgNeedInsert.insert(LastCall); + } else if (MII->isCall()) { + LastCall = &*MII; + if (!HasAMX) + HasCallBeforeAMX = true; + } + } + } +}; + +static void reloadTileConfig(MachineInstr *MI, int FI, + const TargetInstrInfo *TII, + const TargetRegisterInfo *TRI) { + SmallSet<MachineInstr *, 8> CfgNeedInsert; + SmallVector<MachineBasicBlock *, 8> WorkList; + DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo; + + MachineBasicBlock *MBB = MI->getParent(); + BBVisitedInfo[MBB] = BBInfo(CfgNeedInsert, MBB, MI); + + WorkList.push_back(MBB); + while (!WorkList.empty()) { + MBB = WorkList.pop_back_val(); + for (auto I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { + if (!BBVisitedInfo.count(*I)) { + BBVisitedInfo[*I] = BBInfo(CfgNeedInsert, *I); + WorkList.push_back(*I); } } } + + WorkList.clear(); + for (auto I : BBVisitedInfo) { + WorkList.push_back(I.first); + while (!WorkList.empty()) { + MBB = WorkList.pop_back_val(); + if (BBVisitedInfo[MBB].HasCallBeforeAMX || + (!BBVisitedInfo[MBB].HasAMX && + !BBVisitedInfo[MBB].HasAMXBeforeCallInSuccs)) + continue; + for (auto I = MBB->pred_begin(), E = MBB->pred_end(); I != E; ++I) { + if (!BBVisitedInfo.count(*I) || + BBVisitedInfo[*I].HasAMXBeforeCallInSuccs) + continue; + if (BBVisitedInfo[*I].LastCall) + CfgNeedInsert.insert(BBVisitedInfo[*I].LastCall); + BBVisitedInfo[*I].HasAMXBeforeCallInSuccs = true; + WorkList.push_back(*I); + } + } + } + + for (auto *I : CfgNeedInsert) { + BitVector UsableRegs(TRI->getNumRegs()); + const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID); + for (unsigned J = 0; J < RC->getNumRegs(); J++) + UsableRegs.set(X86::TMM0 + J); + for (MachineOperand &CallMO : I->operands()) { + if (CallMO.isRegMask()) + UsableRegs.clearBitsInMask(CallMO.getRegMask()); + } + if (!UsableRegs.none()) + addFrameReference(BuildMI(*I->getParent(), ++I->getIterator(), DebugLoc(), + TII->get(X86::LDTILECFG)), + FI); + } }
bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) { @@ -255,8 +322,8 @@ unsigned Size = ST->getTileConfigSize(); Align Alignment = ST->getTileConfigAlignment(); int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false); - Register CFG = buildConfigMI(MI, SS, TII, MRI, ST); - addTileCFGUse(mf, CFG); + buildConfigMI(MI, SS, TII, MRI, ST); + reloadTileConfig(MI, SS, TII, TRI); return true; } diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -639,8 +639,3 @@ let CopyCost = -1 in // Don't allow copying of tile registers def TILE : RegisterClass<"X86", [x86amx], 8192, (sequence "TMM%u", 0, 7)> {let Size = 8192;} -def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> { - let CopyCost = -1; // Don't allow copying of tile config registers. - let isAllocatable = 1; - let Size = 512; -} diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp --- a/llvm/lib/Target/X86/X86TileConfig.cpp +++ b/llvm/lib/Target/X86/X86TileConfig.cpp @@ -22,6 +22,7 @@ #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -130,13 +131,14 @@ } MachineInstr *X86TileConfig::getTileConfigPoint() { - for (MachineBasicBlock &MBB : *MF) { - - // Traverse the basic block. - for (MachineInstr &MI : MBB) + MachineBasicBlock *Entry = &*MF->begin(); + ReversePostOrderTraversal<MachineBasicBlock *> RPOT(Entry); + for (MachineBasicBlock *MBB : RPOT) { + for (MachineInstr &MI : *MBB) // Refer X86PreTileConfig.cpp. - // We only support one tile config for now.
The other ldtilecfg + // is for spill purpose and is dominated by the first ldtilecfg. + if (MI.getOpcode() == X86::LDTILECFG) return &MI; } @@ -148,7 +150,7 @@ if (!MI) return; MachineBasicBlock *MBB = MI->getParent(); - int SS = MI->getOperand(1).getIndex(); + int SS = MI->getOperand(0).getIndex(); BitVector PhysRegs(TRI->getNumRegs()); // Fill in the palette first. diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll --- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll @@ -1,10 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s - -%struct.__tile_str = type <{ i16, i16, [60 x i8], <256 x i32> }> +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s @buf = dso_local global [3072 x i8] zeroinitializer, align 64 +define internal void @foo() { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: retq +; +; IPRA-LABEL: foo: +; IPRA: # %bb.0: # %entry +; IPRA-NEXT: retq +entry: + ret void +} + define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-LABEL: test_api: ; CHECK: # %bb.0: @@ -25,7 +36,6 @@ ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill ; CHECK-NEXT: movl $buf, %eax ; CHECK-NEXT: movl $32, %r14d ; CHECK-NEXT: movw $8, %r15w @@ -36,11 +46,10 @@ ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2 ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl 
$buf+2048, %eax -; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0 ; CHECK-NEXT: movabsq $64, %rcx ; CHECK-NEXT: tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload @@ -55,16 +64,204 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq +; +; IPRA-LABEL: test_api: +; IPRA: # %bb.0: +; IPRA-NEXT: subq $72, %rsp +; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; IPRA-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp) +; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; IPRA-NEXT: movl $buf, %eax +; IPRA-NEXT: movl $32, %ecx +; IPRA-NEXT: movw $8, %dx +; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0 +; IPRA-NEXT: movl $buf+1024, %eax +; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1 +; IPRA-NEXT: callq foo +; IPRA-NEXT: movl $buf+2048, %eax +; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm2 +; IPRA-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 +; IPRA-NEXT: tilestored %tmm2, (%rax,%rcx) +; IPRA-NEXT: addq $72, %rsp +; IPRA-NEXT: tilerelease +; IPRA-NEXT: vzeroupper +; IPRA-NEXT: retq %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) - tail call void (...) 
@foo() + call void @foo() %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4) tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6) ret void } -declare dso_local void @foo(...) +define dso_local i32 @test_loop(i32 %0) nounwind { +; CHECK-LABEL: test_loop: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $3016, %rsp # imm = 0xBC8 +; CHECK-NEXT: movl %edi, %r14d +; CHECK-NEXT: callq foo +; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: testl %r14d, %r14d +; CHECK-NEXT: jg .LBB2_4 +; CHECK-NEXT: # %bb.1: # %.preheader +; CHECK-NEXT: movl $7, %ebp +; CHECK-NEXT: movl $buf, %r15d +; CHECK-NEXT: movl $32, %r12d +; CHECK-NEXT: movw $8, %bx +; CHECK-NEXT: movl $buf+2048, %r13d +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: tileloadd (%r15,%r12), %tmm0 +; CHECK-NEXT: movabsq $64, %rax +; CHECK-NEXT: tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq foo +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: movabsq $64, %rax +; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm0 # 1024-byte Folded Reload +; CHECK-NEXT: tilestored %tmm0, (%r13,%r12) +; CHECK-NEXT: callq foo +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: decl %ebp +; CHECK-NEXT: cmpl $7, %ebp +; CHECK-NEXT: jne 
.LBB2_2 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: cmpl $3, %r14d +; CHECK-NEXT: jne .LBB2_4 +; CHECK-NEXT: # %bb.6: +; CHECK-NEXT: testl %ebp, %ebp +; CHECK-NEXT: jne .LBB2_5 +; CHECK-NEXT: # %bb.7: +; CHECK-NEXT: incl %r14d +; CHECK-NEXT: jmp .LBB2_8 +; CHECK-NEXT: .LBB2_4: +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq foo +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $32, %eax +; CHECK-NEXT: movl $buf+1024, %ecx +; CHECK-NEXT: movw $8, %dx +; CHECK-NEXT: tileloadd (%rcx,%rax), %tmm0 +; CHECK-NEXT: tilestored %tmm0, (%rcx,%rax) +; CHECK-NEXT: .LBB2_5: +; CHECK-NEXT: decl %r14d +; CHECK-NEXT: .LBB2_8: +; CHECK-NEXT: movl %r14d, %eax +; CHECK-NEXT: addq $3016, %rsp # imm = 0xBC8 +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: tilerelease +; CHECK-NEXT: retq +; +; IPRA-LABEL: test_loop: +; IPRA: # %bb.0: +; IPRA-NEXT: subq $72, %rsp +; IPRA-NEXT: movl %edi, %eax +; IPRA-NEXT: callq foo +; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; IPRA-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp) +; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; IPRA-NEXT: testl %edi, %edi +; IPRA-NEXT: jg .LBB2_4 +; IPRA-NEXT: # %bb.1: # %.preheader +; IPRA-NEXT: movl $7, %ecx +; IPRA-NEXT: movl $buf, %r8d +; IPRA-NEXT: movl $32, %esi +; IPRA-NEXT: movw $8, %di +; IPRA-NEXT: movl $buf+2048, %edx +; IPRA-NEXT: .p2align 4, 0x90 +; IPRA-NEXT: .LBB2_2: # =>This Inner Loop Header: Depth=1 +; IPRA-NEXT: tileloadd (%r8,%rsi), %tmm0 +; IPRA-NEXT: callq foo +; IPRA-NEXT: tilestored %tmm0, (%rdx,%rsi) +; IPRA-NEXT: callq foo +; IPRA-NEXT: decl %ecx +; IPRA-NEXT: cmpl $7, %ecx +; IPRA-NEXT: jne .LBB2_2 +; IPRA-NEXT: # %bb.3: +; IPRA-NEXT: cmpl $3, %eax +; IPRA-NEXT: jne .LBB2_4 +; IPRA-NEXT: # %bb.6: +; IPRA-NEXT: testl %ecx, %ecx +; IPRA-NEXT: jne .LBB2_5 +; IPRA-NEXT: # 
%bb.7: +; IPRA-NEXT: incl %eax +; IPRA-NEXT: jmp .LBB2_8 +; IPRA-NEXT: .LBB2_4: +; IPRA-NEXT: callq foo +; IPRA-NEXT: movl $32, %ecx +; IPRA-NEXT: movl $buf+1024, %edx +; IPRA-NEXT: movw $8, %si +; IPRA-NEXT: tileloadd (%rdx,%rcx), %tmm0 +; IPRA-NEXT: tilestored %tmm0, (%rdx,%rcx) +; IPRA-NEXT: .LBB2_5: +; IPRA-NEXT: decl %eax +; IPRA-NEXT: .LBB2_8: +; IPRA-NEXT: addq $72, %rsp +; IPRA-NEXT: tilerelease +; IPRA-NEXT: vzeroupper +; IPRA-NEXT: retq + call void @foo() + br label %2 +2: + %3 = icmp sgt i32 %0, 0 + br i1 %3, label %11, label %6 +4: + %5 = icmp eq i32 %0, 3 + br i1 %5, label %13, label %11 +6: + %7 = phi i32 [ %9, %6 ], [ 0, %2 ] + %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) + call void @foo() + tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %8) + call void @foo() + %9 = add i32 %7, 1 + %10 = icmp eq i32 %9, 0 + br i1 %10, label %4, label %6 +11: + call void @foo() + %12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) + tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32, x86_amx %12) + br label %17 +13: + %14 = icmp eq i32 %9, 7 + br i1 %14, label %15, label %17 +15: + %16 = add i32 %0, 1 + br label %19 +17: + %18 = sub i32 %0, 1 + br label %19 +19: + %20 = phi i32 [ %16, %15 ], [ %18, %17 ] + ret i32 %20 +} declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) diff --git a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll --- a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll +++ 
b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll @@ -5,6 +5,7 @@ ; CHECK-LABEL: test_amx: ; CHECK: # %bb.0: ; CHECK-NEXT: tdpbf16ps %tmm7, %tmm4, %tmm3 +; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7) ret void diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s +@buf = dso_local global [3072 x i8] zeroinitializer, align 16 + +define dso_local void @test1(i16 signext %0, i16 signext %1) nounwind { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $buf, %eax +; CHECK-NEXT: movl $32, %ecx +; CHECK-NEXT: movw $8, %dx +; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm0 +; CHECK-NEXT: movl $buf+1024, %eax +; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm1 +; CHECK-NEXT: movl $buf+2048, %eax +; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2 +; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 +; CHECK-NEXT: tilestored %tmm2, (%rax,%rcx) +; CHECK-NEXT: tilerelease +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: jmp foo # TAILCALL + %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) + %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* 
getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) + %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) + %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4) + tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6) + tail call void @foo() + ret void +} + +define dso_local void @test2(i16 signext %0, i16 signext %1) nounwind { +; CHECK-LABEL: test2: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq foo +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: jne .LBB1_3 +; CHECK-NEXT: # %bb.1: # %if.true +; CHECK-NEXT: movw $8, %ax +; CHECK-NEXT: tilezero %tmm0 +; CHECK-NEXT: movl $32, %ecx +; CHECK-NEXT: movl $buf+1024, %edx +; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm1 +; CHECK-NEXT: movl $buf+2048, %edx +; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm2 +; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 +; CHECK-NEXT: tilestored %tmm0, (%rdx,%rcx) +; CHECK-NEXT: jmp .LBB1_2 +; CHECK-NEXT: 
.LBB1_3: # %if.false +; CHECK-NEXT: movl $buf, %eax +; CHECK-NEXT: movl $32, %ecx +; CHECK-NEXT: movw $8, %dx +; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm3 +; CHECK-NEXT: movl $buf+1024, %eax +; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm4 +; CHECK-NEXT: movl $buf+2048, %eax +; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2 +; CHECK-NEXT: tdpbssd %tmm2, %tmm4, %tmm3 +; CHECK-NEXT: tilestored %tmm3, (%rax,%rcx) +; CHECK-NEXT: .LBB1_2: # %if.true +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: tilerelease +; CHECK-NEXT: retq + call void @foo() + br i1 undef, label %if.true, label %if.false + +if.true: + %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8) + %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) + %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) + %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3) + tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t4) + br label %exit + +if.false: + %t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) + %t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) + %t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) + %t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7) + tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* 
getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t8) + br label %exit + +exit: + ret void +} + +declare dso_local void @foo() nounwind +declare x86_amx @llvm.x86.tilezero.internal(i16, i16) +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) +declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll --- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll @@ -36,11 +36,10 @@ ; CHECK-NEXT: tileloadd (%r15,%r14), %tmm5 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill -; CHECK-NEXT: movl $buf, %eax -; CHECK-NEXT: movw $8, %cx ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.true +; CHECK-NEXT: movl $buf, %eax +; CHECK-NEXT: movw $8, %cx ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0 ; CHECK-NEXT: movl $buf+1024, %eax ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1 @@ -52,11 +51,13 @@ ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload ; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_2: # %if.false +; CHECK-NEXT: movl $buf, %eax +; CHECK-NEXT: movw $8, %cx ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2 ; CHECK-NEXT: movl $buf+1024, %eax ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm3 @@ -68,7 +69,7 @@ ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tileloadd 
1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload ; CHECK-NEXT: tilestored %tmm6, (%r15,%r14) @@ -139,7 +140,6 @@ ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movl $32, %r14d ; CHECK-NEXT: xorl %ebp, %ebp -; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_2: # %loop.header ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -149,7 +149,7 @@ ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: tilezero %tmm0 ; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1 ; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2