diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h --- a/llvm/include/llvm/CodeGen/LiveIntervals.h +++ b/llvm/include/llvm/CodeGen/LiveIntervals.h @@ -377,6 +377,13 @@ bool checkRegMaskInterference(LiveInterval &LI, BitVector &UsableRegs); + /// Get the interfering slot indexes and their regmasks for a live interval. + /// Return false if there is no interference. + bool + getInterferenceRegMasks(LiveInterval &LI, + SmallVectorImpl &RegSlots, + SmallVectorImpl &RegMaskBits); + // Register unit functions. // // Fixed interference occurs when MachineInstrs use physregs directly diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -952,6 +952,56 @@ } } +bool LiveIntervals::getInterferenceRegMasks( + LiveInterval &LI, SmallVectorImpl &RegSlots, + SmallVectorImpl &RegBits) { + if (LI.empty()) + return false; + LiveInterval::iterator LiveI = LI.begin(), LiveE = LI.end(); + + // Use smaller arrays for local live ranges. + ArrayRef Slots; + ArrayRef Bits; + if (MachineBasicBlock *MBB = intervalIsInOneMBB(LI)) { + Slots = getRegMaskSlotsInBlock(MBB->getNumber()); + Bits = getRegMaskBitsInBlock(MBB->getNumber()); + } else { + Slots = getRegMaskSlots(); + Bits = getRegMaskBits(); + } + + // We are going to enumerate all the register mask slots contained in LI. + // Start with a binary search of RegMaskSlots to find a starting point. + ArrayRef::iterator SlotI = llvm::lower_bound(Slots, LiveI->start); + ArrayRef::iterator SlotE = Slots.end(); + + // No slots in range, LI begins after the last call. + if (SlotI == SlotE) + return false; + + bool Found = false; + while (true) { + assert(*SlotI >= LiveI->start); + // Loop over all slots overlapping this segment. + while (*SlotI < LiveI->end) { + // *SlotI overlaps LI. Collect mask bits. 
+ Found = true; + RegSlots.push_back(*SlotI); + RegBits.push_back(Bits[SlotI - Slots.begin()]); + if (++SlotI == SlotE) + return Found; + } + // *SlotI is beyond the current LI segment. + LiveI = LI.advanceTo(LiveI, *SlotI); + if (LiveI == LiveE) + return Found; + // Advance SlotI until it overlaps. + while (*SlotI < LiveI->start) + if (++SlotI == SlotE) + return Found; + } +} + //===----------------------------------------------------------------------===// // IntervalUpdate class. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -461,25 +461,13 @@ case TargetOpcode::ICALL_BRANCH_FUNNEL: ExpandICallBranchFunnel(&MBB, MBBI); return true; - case X86::PLDTILECFG: { - MI.RemoveOperand(0); - MI.setDesc(TII->get(X86::LDTILECFG)); - return true; - } - case X86::PSTTILECFG: { - MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg - MI.setDesc(TII->get(X86::STTILECFG)); - return true; - } case X86::PTILELOADDV: { - MI.RemoveOperand(8); // Remove $tmmcfg for (unsigned i = 2; i > 0; --i) MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILELOADD)); return true; } case X86::PTDPBSSDV: { - MI.RemoveOperand(7); // Remove $tmmcfg MI.untieRegOperand(4); for (unsigned i = 3; i > 0; --i) MI.RemoveOperand(i); @@ -488,14 +476,13 @@ return true; } case X86::PTILESTOREDV: { - MI.RemoveOperand(8); // Remove $tmmcfg for (int i = 1; i >= 0; --i) MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILESTORED)); return true; } case X86::PTILEZEROV: { - for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg + for (int i = 2; i > 0; --i) // Remove row, col MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILEZERO)); return true; diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ 
b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2094,8 +2094,14 @@ // Emit tilerelease for AMX kernel. const MachineRegisterInfo &MRI = MF.getRegInfo(); - if (!MRI.reg_nodbg_empty(X86::TMMCFG)) - BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE)); + const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID); + unsigned TileRegNum = RC->getNumRegs(); + for (unsigned I = 0; I < TileRegNum; I++) { + if (!MRI.reg_nodbg_empty(X86::TMM0 + I)) { + BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE)); + break; + } + } } StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4606,7 +4606,6 @@ SDValue Index = Node->getOperand(5); SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; SDValue Ops[] = {Node->getOperand(2), @@ -4616,7 +4615,6 @@ Index, Disp, Segment, - CFG, Chain}; CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); ReplaceNode(Node, CNode); @@ -4627,14 +4625,12 @@ break; SDValue Chain = Node->getOperand(0); unsigned Opc = X86::PTDPBSSDV; - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Node->getOperand(4), Node->getOperand(5), Node->getOperand(6), Node->getOperand(7), - CFG, Chain}; MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); @@ -4646,8 +4642,7 @@ break; unsigned Opc = X86::PTILEZEROV; SDValue Chain = Node->getOperand(0); - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); - SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain}; + SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Chain}; MachineSDNode *CNode = 
CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); ReplaceNode(Node, CNode); @@ -4718,7 +4713,6 @@ SDValue Index = Node->getOperand(5); SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; SDValue Ops[] = {Node->getOperand(2), @@ -4729,7 +4723,6 @@ Disp, Segment, Node->getOperand(6), - CFG, Chain}; CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); ReplaceNode(Node, CNode); diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -48,23 +48,14 @@ VEX, T8XD; // Pseduo instruction for RA. - let hasSideEffects = 1, mayLoad = 1, - Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in - def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>; - - let hasSideEffects = 1, mayStore = 1 in - def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>; - def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, - opaquemem:$src3, - TILECFG:$cfg), []>; + opaquemem:$src3), []>; def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, - TILE:$src4, TILECFG:$cfg), []>; + TILE:$src4), []>; def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, - GR16:$src2, - TILECFG:$cfg), []>; + GR16:$src2), []>; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. @@ -104,7 +95,7 @@ let Constraints = "$src4 = $dst" in def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6, TILECFG:$cfg), []>; + TILE:$src5, TILE:$src6), []>; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3808,10 +3808,6 @@ MachineOperand &MO = NewMI->getOperand(2); MO.setReg(VirtReg); MO.setIsKill(true); - } else if (RC->getID() == X86::TILECFGRegClassID) { - unsigned Opc = X86::PSTTILECFG; - addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) - .addReg(SrcReg, getKillRegState(isKill)); } else { unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = @@ -3840,10 +3836,6 @@ MachineOperand &MO = NewMI->getOperand(3); MO.setReg(VirtReg); MO.setIsKill(true); - } else if (RC->getID() == X86::TILECFGRegClassID) { - unsigned Opc = X86::PLDTILECFG; - addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), - FrameIdx); } else { const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -38,6 +38,7 @@ #include "X86InstrBuilder.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -62,8 +63,13 @@ const TargetInstrInfo *TII; MachineDominatorTree *DomTree = nullptr; MachineRegisterInfo *MRI = nullptr; + LiveIntervals *LIS = nullptr; + SmallVector VTileRegs; + MachineInstr *TileConfigMI = nullptr; + void buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx); MachineInstr *getTileConfigPoint(); + void reloadTileConfig(int FI); public: X86PreTileConfig() : MachineFunctionPass(ID) {} @@ -88,20 +94,21 @@ INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig", "Tile Register Configure", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) 
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig", "Tile Register Configure", false, false) void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } -static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx, - const TargetInstrInfo *TII, - MachineRegisterInfo *MRI, - const X86Subtarget *ST) { +void X86PreTileConfig::buildConfigMI(MachineBasicBlock::iterator MI, + int FrameIdx) { auto *MBB = MI->getParent(); // FIXME: AMX should assume AVX512 enabled. @@ -111,18 +118,15 @@ BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm) .addReg(Zmm, RegState::Undef) .addReg(Zmm, RegState::Undef); - addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)), - FrameIdx) - .addReg(Zmm); + TileConfigMI = &*addFrameReference(BuildMI(*MBB, MI, DebugLoc(), + TII->get(X86::VMOVUPSZmr)), + FrameIdx) + .addReg(Zmm); } // build psuedo ldtilecfg - Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass); - - addFrameReference( - BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx); - - return VReg; + addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::LDTILECFG)), + FrameIdx); } static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) { @@ -151,6 +155,7 @@ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); if (RC.getID() != X86::TILERegClassID) continue; + VTileRegs.push_back(VirtReg); // Find the common dominator for all MI that define tile register. 
for (const MachineOperand &MO : MRI->def_operands(VirtReg)) { @@ -219,23 +224,138 @@ return &*MII; } -static void addTileCFGUse(MachineFunction &MF, Register CFG) { - for (MachineBasicBlock &MBB : MF) { +void X86PreTileConfig::reloadTileConfig(int FI) { + SmallSet MIVisited; + const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID); + auto TileRegNum = RC->getNumRegs(); + + for (Register VReg : VTileRegs) { + BitVector UsableRegs(TRI->getNumRegs()); + for (unsigned I = 0; I < TileRegNum; I++) + UsableRegs.set(X86::TMM0 + I); + SmallVector RegSlots; + SmallVector RegMasks; + LiveInterval &LI = LIS->getInterval(VReg); + if (!LIS->getInterferenceRegMasks(LI, RegSlots, RegMasks)) + continue; + for (unsigned I = 0; I < RegSlots.size(); I++) { + SlotIndex &SI = RegSlots[I]; + MachineInstr *MI = LIS->getInstructionFromIndex(SI); + // This instruction has already been handled. + if (MIVisited.count(MI)) + continue; + // For inline assembly, we don't reload tile config register. + // If there is any ldtilecfg instruction in inline assembly, + // it is the user's responsibility to restore everything. + if (!MI->isCall()) + continue; + UsableRegs.clearBitsInMask(RegMasks[I]); + MIVisited.insert(MI); + // There is no interference in the callee. This benefits from + // IPRA. + if (UsableRegs.none()) + continue; + + // Build ldtilecfg after the call to reload the tile config. + auto *MBB = MI->getParent(); + auto MII = MachineBasicBlock::iterator(MI); + MII++; + addFrameReference( + BuildMI(*MBB, *MII, DebugLoc(), TII->get(X86::LDTILECFG)), FI); + } + } + // We have only checked tile data register interference; we also need to + // check tile config register interference. Since we don't model the config + // register, we should check interference from the ldtilecfg to each tile data + // register def. + // ldtilecfg + // / \ + // BB1 BB2 + // / \ + // call BB3 + // / \ + // %1=tileload %2=tilezero + // We can start from the instruction of each tile def, and walk backward to + // ldtilecfg. 
If there is any call instruction and a tile data register is + // not preserved, we should insert ldtilecfg after the call instruction. + SmallSet MBBVisited; + for (Register VReg : VTileRegs) { + for (MachineOperand &MO : MRI->def_operands(VReg)) { + if (MO.isUndef()) + continue; + MachineInstr *MI = MO.getParent(); + // May be a PHI instruction. + // There must be several tile defs before the PHI instruction. + if (MI->isTransient()) + continue; + + bool Terminate = false; + MachineBasicBlock *MBB = MI->getParent(); + // Walk backward to see if there is any call instruction after ldtilecfg. + std::queue WorkList; + WorkList.push(MBB); + bool First = true; + while (!WorkList.empty()) { + MBB = WorkList.front(); + WorkList.pop(); + // If we have iterated the basic block before, don't iterate it and + // its predecessors again. This may be caused by a loop, or it has a + // cross path from several successors, or it has been iterated when + // handling another tile register. Below, BB1 hits this condition. + // ldtilecfg + // | + // ---BB1--- + // / \ + // BB2 BB3 + // / \ + // %1=tileload %2=tilezero + if (MBBVisited.count(MBB)) + continue; + // For the first MBB, we start from the amx instruction which defines + // the tile register. + auto I = (First) ? MI->getReverseIterator() : MBB->instr_rbegin(); + for (auto E = MBB->instr_rend(); I != E; ++I) { + // If it is the insertion point for ldtilecfg, then we've finished + // walking backward. + if (&*I == TileConfigMI) { + Terminate = true; + break; + } + if (MIVisited.count(&*I)) + continue; + if (!I->isCall()) + continue; + BitVector UsableRegs(TRI->getNumRegs()); + for (unsigned I = 0; I < TileRegNum; I++) + UsableRegs.set(X86::TMM0 + I); + for (MachineOperand &CallMO : I->operands()) { + if (CallMO.isRegMask()) + UsableRegs.clearBitsInMask(CallMO.getRegMask()); + } + // Record the call to avoid double ldtilecfg insert. + MIVisited.insert(&*I); + if (UsableRegs.none()) + continue; + // Insert ldtilecfg after call instruction. 
+ --I; + addFrameReference( + BuildMI(*MBB, *I, DebugLoc(), TII->get(X86::LDTILECFG)), FI); + } + // We reached the tile config point, so we don't need to walk backward + // any further. + if (Terminate) + break; + // Next we will iterate its predecessors. + for (MachineBasicBlock::pred_iterator S = MBB->pred_begin(), + E = MBB->pred_end(); + S != E; S++) + WorkList.push(*S); - // Traverse the basic block. - for (MachineInstr &MI : MBB) { - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - default: - break; - case X86::PTILELOADDV: - case X86::PTILESTOREDV: - case X86::PTDPBSSDV: - case X86::PTILEZEROV: - unsigned NumOperands = MI.getNumOperands(); - MI.RemoveOperand(NumOperands - 1); - MI.addOperand(MF, MachineOperand::CreateReg(CFG, false)); - break; + // The first MBB may be visited a second time when it is in a loop, so + // it is not marked as visited. + if (!First) + MBBVisited.insert(MBB); + First = false; } } } @@ -248,15 +368,17 @@ TRI = ST->getRegisterInfo(); TII = mf.getSubtarget().getInstrInfo(); DomTree = &getAnalysis(); + LIS = &getAnalysis(); - MachineInstr *MI = getTileConfigPoint(); - if (!MI) + auto *TileConfigPoint = getTileConfigPoint(); + if (!TileConfigPoint) return false; unsigned Size = ST->getTileConfigSize(); Align Alignment = ST->getTileConfigAlignment(); int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false); - Register CFG = buildConfigMI(MI, SS, TII, MRI, ST); - addTileCFGUse(mf, CFG); + buildConfigMI(TileConfigPoint, SS); + reloadTileConfig(SS); + VTileRegs.clear(); return true; } diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -639,8 +639,3 @@ let CopyCost = -1 in // Don't allow copying of tile registers def TILE : RegisterClass<"X86", [x86amx], 8192, (sequence "TMM%u", 0, 7)> {let Size = 8192;} -def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> { - let CopyCost = -1; // Don't allow copying of 
tile config registers. - let isAllocatable = 1; - let Size = 512; -} diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp --- a/llvm/lib/Target/X86/X86TileConfig.cpp +++ b/llvm/lib/Target/X86/X86TileConfig.cpp @@ -22,6 +22,7 @@ #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -130,13 +131,14 @@ } MachineInstr *X86TileConfig::getTileConfigPoint() { - for (MachineBasicBlock &MBB : *MF) { - - // Traverse the basic block. - for (MachineInstr &MI : MBB) + MachineBasicBlock *Entry = &*MF->begin(); + ReversePostOrderTraversal RPOT(Entry); + for (MachineBasicBlock *MBB : RPOT) { + for (MachineInstr &MI : *MBB) // Refer X86PreTileConfig.cpp. - // We only support one tile config for now. - if (MI.getOpcode() == X86::PLDTILECFG) + // We only support one tile config for now. The other ldtilecfg + // is for spill purpose and is dominated by the first ldtilecfg. + if (MI.getOpcode() == X86::LDTILECFG) return &MI; } @@ -148,7 +150,7 @@ if (!MI) return; MachineBasicBlock *MBB = MI->getParent(); - int SS = MI->getOperand(1).getIndex(); + int SS = MI->getOperand(0).getIndex(); BitVector PhysRegs(TRI->getNumRegs()); // Fill in the palette first. 
diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll --- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll @@ -1,10 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s - -%struct.__tile_str = type <{ i16, i16, [60 x i8], <256 x i32> }> +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s @buf = dso_local global [3072 x i8] zeroinitializer, align 64 +define internal void @foo() #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq +; +; IPRA-LABEL: foo: +; IPRA: # %bb.0: # %entry +; IPRA-NEXT: pushq %rbp +; IPRA-NEXT: .cfi_def_cfa_offset 16 +; IPRA-NEXT: .cfi_offset %rbp, -16 +; IPRA-NEXT: movq %rsp, %rbp +; IPRA-NEXT: .cfi_def_cfa_register %rbp +; IPRA-NEXT: popq %rbp +; IPRA-NEXT: .cfi_def_cfa %rsp, 8 +; IPRA-NEXT: retq +entry: + ret void +} + define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-LABEL: test_api: ; CHECK: # %bb.0: @@ -25,7 +50,6 @@ ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill ; CHECK-NEXT: movl $buf, %eax ; CHECK-NEXT: movl $32, %r14d ; CHECK-NEXT: movw $8, %r15w @@ -36,11 +60,10 @@ ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2 ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: 
vzeroupper ; CHECK-NEXT: callq foo +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $buf+2048, %eax -; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0 ; CHECK-NEXT: movabsq $64, %rcx ; CHECK-NEXT: tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload @@ -55,17 +78,48 @@ ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq +; +; IPRA-LABEL: test_api: +; IPRA: # %bb.0: +; IPRA-NEXT: pushq %rbp +; IPRA-NEXT: subq $64, %rsp +; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; IPRA-NEXT: vmovdqu64 %zmm0, (%rsp) +; IPRA-NEXT: movb $1, (%rsp) +; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp) +; IPRA-NEXT: ldtilecfg (%rsp) +; IPRA-NEXT: movl $buf, %eax +; IPRA-NEXT: movl $32, %ecx +; IPRA-NEXT: movw $8, %dx +; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0 +; IPRA-NEXT: movl $buf+1024, %eax +; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1 +; IPRA-NEXT: callq foo +; IPRA-NEXT: movl $buf+2048, %eax +; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm2 +; IPRA-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 +; IPRA-NEXT: tilestored %tmm2, (%rax,%rcx) +; IPRA-NEXT: addq $64, %rsp +; IPRA-NEXT: popq %rbp +; IPRA-NEXT: tilerelease +; IPRA-NEXT: vzeroupper +; IPRA-NEXT: retq %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) - tail call void (...) 
@foo() + call void @foo() %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4) tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6) ret void } -declare dso_local void @foo(...) - declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) + +attributes #0 = { noinline nounwind optnone uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll --- a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll @@ -5,7 +5,6 @@ ; CHECK-LABEL: test_amx: ; CHECK: # %bb.0: ; CHECK-NEXT: tdpbf16ps %tmm7, %tmm4, %tmm3 -; CHECK-NEXT: retq call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7) ret void } diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll @@ -0,0 +1,131 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f 
-verify-machineinstrs | FileCheck %s +@buf = dso_local global [3072 x i8] zeroinitializer, align 16 + +define dso_local void @test1(i16 signext %0, i16 signext %1) local_unnamed_addr { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movl $buf, %eax +; CHECK-NEXT: movl $32, %ecx +; CHECK-NEXT: movw $8, %dx +; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm0 +; CHECK-NEXT: movl $buf+1024, %eax +; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm1 +; CHECK-NEXT: movl $buf+2048, %eax +; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2 +; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 +; CHECK-NEXT: tilestored %tmm2, (%rax,%rcx) +; CHECK-NEXT: tilerelease +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: jmp foo # TAILCALL + %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) + %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) + %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) + %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4) + tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6) + tail call void @foo() + ret void +} + +define dso_local void @test2(i16 signext %0, i16 signext %1) local_unnamed_addr { +; 
CHECK-LABEL: test2: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq foo +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: jne .LBB1_3 +; CHECK-NEXT: # %bb.1: # %if.true +; CHECK-NEXT: movw $8, %ax +; CHECK-NEXT: tilezero %tmm0 +; CHECK-NEXT: movl $32, %ecx +; CHECK-NEXT: movl $buf+1024, %edx +; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm1 +; CHECK-NEXT: movl $buf+2048, %edx +; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm2 +; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 +; CHECK-NEXT: tilestored %tmm0, (%rdx,%rcx) +; CHECK-NEXT: jmp .LBB1_2 +; CHECK-NEXT: .LBB1_3: # %if.false +; CHECK-NEXT: movl $buf, %eax +; CHECK-NEXT: movl $32, %ecx +; CHECK-NEXT: movw $8, %dx +; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm3 +; CHECK-NEXT: movl $buf+1024, %eax +; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm4 +; CHECK-NEXT: movl $buf+2048, %eax +; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2 +; CHECK-NEXT: tdpbssd %tmm2, %tmm4, %tmm3 +; CHECK-NEXT: tilestored %tmm3, (%rax,%rcx) +; CHECK-NEXT: .LBB1_2: # %if.true +; 
CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: tilerelease +; CHECK-NEXT: retq + call void @foo() + br i1 undef, label %if.true, label %if.false + +if.true: + %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8) + %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) + %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) + %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3) + tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t4) + br label %exit + +if.false: + %t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) + %t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) + %t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) + %t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7) + tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t8) + br label %exit + +exit: + ret void +} + +declare dso_local void @foo() local_unnamed_addr +declare x86_amx @llvm.x86.tilezero.internal(i16, i16) +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) +declare x86_amx 
@llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll --- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll @@ -36,11 +36,10 @@ ; CHECK-NEXT: tileloadd (%r15,%r14), %tmm5 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill -; CHECK-NEXT: movl $buf, %eax -; CHECK-NEXT: movw $8, %cx ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.true +; CHECK-NEXT: movl $buf, %eax +; CHECK-NEXT: movw $8, %cx ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0 ; CHECK-NEXT: movl $buf+1024, %eax ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1 @@ -52,11 +51,13 @@ ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload ; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_2: # %if.false +; CHECK-NEXT: movl $buf, %eax +; CHECK-NEXT: movw $8, %cx ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2 ; CHECK-NEXT: movl $buf+1024, %eax ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm3 @@ -68,7 +69,7 @@ ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload ; CHECK-NEXT: tilestored %tmm6, (%r15,%r14) @@ -139,7 +140,6 @@ ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movl $32, %r14d ; CHECK-NEXT: xorl %ebp, %ebp -; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: 
.LBB1_2: # %loop.header ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -149,7 +149,7 @@ ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: tilezero %tmm0 ; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1 ; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2 diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -120,6 +120,8 @@ ; CHECK-NEXT: X86 EFLAGS copy lowering ; CHECK-NEXT: X86 WinAlloca Expander ; CHECK-NEXT: MachineDominator Tree Construction +; CHECK-NEXT: Slot index numbering +; CHECK-NEXT: Live Interval Analysis ; CHECK-NEXT: Tile Register Pre-configure ; CHECK-NEXT: Detect Dead Lanes ; CHECK-NEXT: Process Implicit Definitions