diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -6,31 +6,19 @@
 //
 //===----------------------------------------------------------------------===//
 //
-/// \file Pass to pre-config the shape of AMX register
-/// AMX register need to be configured before use. The shape of AMX register
-/// is encoded in the 1st and 2nd machine operand of AMX pseudo instructions.
-/// The pldtilecfg is to config tile registers. It should dominator all AMX
-/// instructions. The pldtilecfg produce a virtual cfg register and the cfg
-/// register is used by all AMX instructions.
-/// This pass is to find the common dominator of all AMX instructions and
-/// insert the pldtilecfg instruction. Besides the cfg register that pldtilecfg
-/// produces is inserted as the last operand of each AMX instruction. We use
-/// this scheme to model the def-use relationship between AMX config instruction
-/// and other AMX instructions. Below is an example.
+/// \file Pass to pre-configure the shapes of AMX registers
+/// An AMX register needs to be configured before use. The shapes of an AMX
+/// register are encoded in the 1st and 2nd machine operands of AMX pseudo
+/// instructions.
 ///
-///                 ----B1----
-///                /          \
-///               /            \
-///              B2             B3
-///    %1:tile = PTILELOADDV    %2:tile = PTILELOADDV
+/// The instruction ldtilecfg is used to configure the shapes. It must
+/// dominate all AMX instructions and postdominate all variable shape
+/// definitions.
 ///
-/// is transformed to
+/// The shape registers are caller-saved according to the ABI. We need to
+/// insert ldtilecfg again after a call instruction if the callee clobbers any
+/// AMX register.
 ///
-///                  B1
-///   %25:tilecfg = PLDTILECFG
-///                /          \
-///               /            \
-///    %1:tile = PTILELOADDV %25    %2:tile = PTILELOADDV %25
+/// This pass calculates all the points where ldtilecfg needs to be inserted
+/// and inserts it there. It reports an error if the domination conditions
+/// aren't met.
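+///
+/// A sketch of the simplest case (the block names and virtual registers are
+/// illustrative, not output of this pass): both tile defs share the shapes
+/// %r and %c, which are defined in B1, so a single ldtilecfg inserted at the
+/// end of B1 dominates both AMX instructions and postdominates both shape
+/// defs.
+///
+///                  ----B1----
+///                 /          \
+///                /            \
+///               B2             B3
+///     %1:tile = PTILELOADDV    %2:tile = PTILELOADDV
+///
+/// If a call that clobbers tile registers occurs between the inserted
+/// ldtilecfg and a later AMX instruction, another ldtilecfg is inserted
+/// right after that call.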
 //
 //===----------------------------------------------------------------------===//
@@ -38,9 +26,9 @@
 #include "X86InstrBuilder.h"
 #include "X86RegisterInfo.h"
 #include "X86Subtarget.h"
-#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@@ -51,19 +39,177 @@
 using namespace llvm;

 #define DEBUG_TYPE "tile-pre-config"
+#define ASSERT_VALID_COMPARE                                                   \
+  assert((MI || RHS.MI) && "Cannot compare two uninitialized nodes");          \
+  assert((!MI || !RHS.MI || MI->getParent() == RHS.MI->getParent()) &&         \
+         "Cannot compare between different BBs");
+#define REPORT_CONFIG_FAIL                                                     \
+  report_fatal_error(                                                          \
+      MF.getName() +                                                           \
+      ": Failed to config tile register, please define the shape earlier");

 namespace {

 class X86PreTileConfig : public MachineFunctionPass {
-  // context
-  MachineFunction *MF = nullptr;
-  const X86Subtarget *ST = nullptr;
-  const TargetRegisterInfo *TRI;
-  const TargetInstrInfo *TII;
-  MachineDominatorTree *DomTree = nullptr;
+  // A light-weight reference to a MachineInstr that also caches the
+  // instruction's position within its parent MBB, so that two MIRefs in the
+  // same MBB can be ordered cheaply.
+  struct MIRef {
+    MachineInstr *MI = nullptr;
+    size_t Pos = 0;
+    MIRef() = default;
+    MIRef(MachineInstr *MI)
+        : MI(MI), Pos(std::distance(MI->getParent()->instr_begin(),
+                                    MI->getIterator())) {}
+    MIRef(MachineInstr *MI, size_t Pos) : MI(MI), Pos(Pos) {}
+    MIRef(MachineBasicBlock::iterator MII)
+        : MI(&*MII), Pos(std::distance(MII->getParent()->begin(), MII)) {}
+    MachineInstr *operator->() { return MI; }
+    operator bool() const { return MI != nullptr; }
+    bool operator==(const MIRef &RHS) const { return MI == RHS.MI; }
+    bool operator<(const MIRef &RHS) const {
+      ASSERT_VALID_COMPARE;
+      return Pos < RHS.Pos;
+    }
+    bool operator>(const MIRef &RHS) const {
+      ASSERT_VALID_COMPARE;
+      return Pos > RHS.Pos;
+    }
+  };
+
+  struct BBInfo {
+    MIRef FirstAMX;
+    MIRef LastCall;
+    MIRef LastShape;
+    bool NeedTileCfgLiveIn = false;
+    bool PostDominateAllShape = true;
+  };
+
+  // Walks the CFG backwards from a starting MBB, calling update() on each
+  // (predecessor, successor) pair. Subclasses keep the walk going by pushing
+  // the predecessor back onto the work list.
+  struct DFSPredsUpdater {
+    X86PreTileConfig *PTC;
+    SmallVector<MachineBasicBlock *, 8> WorkList;
+    virtual void update(MachineBasicBlock *, MachineBasicBlock *) = 0;
+    DFSPredsUpdater(X86PreTileConfig *PTC, MachineBasicBlock *MBB)
+        : PTC(PTC), WorkList({MBB}) {}
+    void run() {
+      while (!WorkList.empty()) {
+        MachineBasicBlock *MBB = WorkList.pop_back_val();
+        std::for_each(MBB->pred_begin(), MBB->pred_end(), [&](auto *Pred) {
+          PTC->getBBInfo(Pred);
+          update(Pred, MBB);
+        });
+      }
+    }
+  };
+
+  struct UpdateLastCalls : DFSPredsUpdater {
+    UpdateLastCalls(X86PreTileConfig *PTC, MachineBasicBlock *MBB)
+        : DFSPredsUpdater(PTC, MBB) {}
+    void update(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
+      // Succ needs the tile config to be live in, so mark the last call in
+      // MBB as needing a ldtilecfg after it.
+      // We don't need to update MBB's predecessors if MBB has a call or
+      // already has NeedTileCfgLiveIn = true, which means either
+      // 1. this MBB has already been updated, or
+      // 2. this MBB has an AMX instruction and will be updated later.
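+      // E.g. for a hypothetical chain B0 (contains a call) -> B1 -> B2
+      // (contains AMX): starting from B2, B1 is marked NeedTileCfgLiveIn and
+      // pushed; from B1 we reach B0, record B0's last call as a reload point
+      // and stop walking there.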
+      if (PTC->BBVisitedInfo[MBB].LastCall) {
+        PTC->CfgNeedInsert.insert(PTC->BBVisitedInfo[MBB].LastCall);
+      } else if (!PTC->BBVisitedInfo[MBB].NeedTileCfgLiveIn) {
+        PTC->BBVisitedInfo[MBB].NeedTileCfgLiveIn = true;
+        WorkList.push_back(MBB);
+      }
+    }
+  };
+
+  struct UpdateShapeDominators : DFSPredsUpdater {
+    UpdateShapeDominators(X86PreTileConfig *PTC, MachineBasicBlock *MBB)
+        : DFSPredsUpdater(PTC, MBB) {}
+    void update(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
+      // Since there's a shape def in Succ or in Succ's successors, we need to
+      // clear the flag PostDominateAllShape for MBB and all its predecessors.
+      // The exception is when Succ happens to be the header of a loop: we
+      // cannot clear the flag for the bottom BB of that loop based on the
+      // header alone.
+      if (PTC->BBVisitedInfo[MBB].PostDominateAllShape &&
+          (!PTC->MLI->isLoopHeader(Succ) ||
+           PTC->MLI->getLoopFor(Succ)->getBottomBlock() != MBB)) {
+        PTC->BBVisitedInfo[MBB].PostDominateAllShape = false;
+        WorkList.push_back(MBB);
+      }
+    }
+  };
+
+  BitVector AMXRegs;
+  MachineLoopInfo *MLI = nullptr;
+  SmallSet<MIRef, 8> CfgNeedInsert;
   MachineRegisterInfo *MRI = nullptr;
+  DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo;
+
+  // Returns true if the call instruction MI clobbers any AMX register.
+  bool isDestructiveCall(MachineInstr &MI) {
+    BitVector UsableRegs(AMXRegs);
+    auto Iter = llvm::find_if(
+        MI.operands(), [](MachineOperand &MO) { return MO.isRegMask(); });
+    if (Iter == MI.operands_end())
+      return false;
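+    // A regmask operand has a bit set for each register *preserved* by the
+    // call, so clearing those bits in a copy of AMXRegs leaves exactly the
+    // AMX registers this call clobbers.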
+    UsableRegs.clearBitsInMask(Iter->getRegMask());
+    return !UsableRegs.none();
+  }
+
+  // Scan MBB once (on first visit) to cache its first AMX instruction and
+  // its last AMX-clobbering call, and record any in-block reload point.
+  void getBBInfo(MachineBasicBlock *MBB) {
+    if (!BBVisitedInfo.count(MBB)) {
+      BBVisitedInfo[MBB] = BBInfo();
+      size_t Pos = 0;
+      for (auto &MI : *MBB) {
+        if (isAMXInstruction(MI)) {
+          // If there's a call before the AMX instruction, we need to reload
+          // the tile config after that call.
+          if (BBVisitedInfo[MBB].LastCall)
+            CfgNeedInsert.insert(BBVisitedInfo[MBB].LastCall);
+          else /* Otherwise, we need the tile config to live in this BB. */
+            BBVisitedInfo[MBB].NeedTileCfgLiveIn = true;
+          // Always record the first AMX in case there's a shape def after it.
+          if (!BBVisitedInfo[MBB].FirstAMX)
+            BBVisitedInfo[MBB].FirstAMX = MIRef(&MI, Pos);
+        } else if (MI.isCall() && isDestructiveCall(MI)) {
+          // Record the call only if the callee clobbers any AMX register.
+          BBVisitedInfo[MBB].LastCall = MIRef(&MI, Pos);
+        }
+        ++Pos;
+      }
+    }
+  }
+
+  void collectShapeInfo(ShapeT Shape) {
+    for (auto *ShapeMO : {Shape.getRow(), Shape.getCol()}) {
+      Register ShapeReg = ShapeMO->getReg();
+      for (MachineOperand &MO : MRI->def_operands(ShapeReg)) {
+        MachineInstr *MI = MO.getParent();
+        if (MI->isMoveImmediate())
+          continue;
+        MIRef MIR(MI);
+        MachineBasicBlock *MBB = MIR->getParent();
+        getBBInfo(MBB);
+        if (!BBVisitedInfo[MBB].LastShape)
+          UpdateShapeDominators(this, MBB).run();
+        if (BBVisitedInfo[MBB].LastShape < MIR)
+          BBVisitedInfo[MBB].LastShape = MIR;
+      }
+    }
+  }

-  MachineInstr *getTileConfigPoint();
+  bool isAMXInstruction(MachineInstr &MI) {
+    switch (MI.getOpcode()) {
+    default:
+      return false;
+    case X86::PTILESTOREDV:
+      collectShapeInfo(ShapeT(&MI.getOperand(0), &MI.getOperand(1), MRI));
+      return true;
+    case X86::PTILELOADDV:
+    case X86::PTDPBSSDV:
+    case X86::PTDPBSUDV:
+    case X86::PTDPBUSDV:
+    case X86::PTDPBUUDV:
+    case X86::PTILEZEROV:
+    case X86::PTDPBF16PSV:
+      collectShapeInfo(ShapeT(&MI.getOperand(1), &MI.getOperand(2), MRI));
+      return true;
+    }
+  }

 public:
   X86PreTileConfig() : MachineFunctionPass(ID) {}
@@ -88,278 +234,133 @@
 INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
                       "Tile Register Pre-configure", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
                     "Tile Register Pre-configure", false, false)

 void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }

-static void buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
-                          const TargetInstrInfo *TII, MachineRegisterInfo *MRI,
-                          const X86Subtarget *ST) {
-  auto *MBB = MI->getParent();
+bool X86PreTileConfig::runOnMachineFunction(MachineFunction &MF) {
+  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+  const TargetInstrInfo *TII = ST.getInstrInfo();
+  const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+  const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);

-  // Zero stack slot.
-  if (ST->hasAVX512()) {
+  MRI = &MF.getRegInfo();
+  MLI = &getAnalysis<MachineLoopInfo>();
+
+  AMXRegs.resize(TRI->getNumRegs());
+  for (unsigned I = 0; I < RC->getNumRegs(); I++)
+    AMXRegs.set(X86::TMM0 + I);
+
+  CfgNeedInsert.clear();
+  BBVisitedInfo.clear();
+  for (auto &MBB : MF)
+    getBBInfo(&MBB);
+
+  // We must update NeedTileCfgLiveIn after all BBs have been visited.
+  std::for_each(MF.rbegin(), MF.rend(), [&](auto &MBB) {
+    if (BBVisitedInfo[&MBB].FirstAMX && BBVisitedInfo[&MBB].NeedTileCfgLiveIn)
+      UpdateLastCalls(this, &MBB).run();
+  });
+
+  // If we didn't find any tile config live-in point, there's no AMX
+  // instruction in this function.
+  if (CfgNeedInsert.empty() && !BBVisitedInfo[&MF.front()].NeedTileCfgLiveIn)
+    return false;
+
+  DebugLoc DL;
+  SmallSet<MIRef, 8> Inserted;
+  int SS = MF.getFrameInfo().CreateStackObject(
+      ST.getTileConfigSize(), ST.getTileConfigAlignment(), false);
+
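+  // Try to insert a ldtilecfg instruction right after the point I. If I's
+  // MBB doesn't postdominate all shape defs yet, first sink the point into
+  // the single successor that still needs the config live in, then place the
+  // ldtilecfg after the last shape def of the final MBB.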
+  auto TryToInsertAfter = [&](MIRef I) {
+    MachineBasicBlock *MBB = I->getParent();
+    // Sink the insertion point along the chain of BBs with
+    // NeedTileCfgLiveIn = true while MBB doesn't postdominate all shape defs.
+    while (!BBVisitedInfo[MBB].PostDominateAllShape) {
+      MachineBasicBlock *Next = nullptr;
+      // Limitation 1:
+      // We cannot sink it across any AMX instruction.
+      if (BBVisitedInfo[MBB].FirstAMX)
+        REPORT_CONFIG_FAIL;
+      std::for_each(MBB->succ_begin(), MBB->succ_end(), [&](auto *Succ) {
+        if (BBVisitedInfo[Succ].NeedTileCfgLiveIn) {
+          // Limitation 2:
+          // We cannot handle the chain being forked, since we then cannot
+          // find a single point that dominates all AMX instructions.
+          // FIXME: We should improve the case where the forked chains
+          // aggregate into one BB again before meeting any AMX instruction,
+          // e.g. amx-ldtilecfg-insert.ll:test3 without the call in %if.true.
+          // I'd like to fix it in another patch.
+          if (Next)
+            REPORT_CONFIG_FAIL;
+          Next = Succ;
+          I = MIRef(Next->getFirstNonPHI());
+        }
+      });
+      MBB = Next;
+    }
+
+    // Limitation 3:
+    // Even if MBB postdominates all shapes, we still need to check that no
+    // AMX instruction precedes a shape def within the same MBB.
+    if (BBVisitedInfo[MBB].FirstAMX &&
+        BBVisitedInfo[MBB].FirstAMX < BBVisitedInfo[MBB].LastShape)
+      REPORT_CONFIG_FAIL;
+    // Make sure we insert ldtilecfg after the last shape def in MBB.
+    if (I < BBVisitedInfo[MBB].LastShape)
+      I = BBVisitedInfo[MBB].LastShape;
+    // The insertion point may be sunk to the same place more than once.
+    // Record it to avoid duplicated inserts.
+    if (Inserted.insert(I).second)
+      addFrameReference(
+          BuildMI(*MBB, ++I->getIterator(), DL, TII->get(X86::LDTILECFG)), SS);
+  };
+
+  for (auto I : CfgNeedInsert)
+    TryToInsertAfter(I);
+
+  if (BBVisitedInfo[&MF.front()].NeedTileCfgLiveIn)
+    TryToInsertAfter(MIRef(MF.front().getFirstNonPHI()));
+
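+  // The inserted ldtilecfg reads its configuration from the new stack slot,
+  // so zero-initialize the slot in the entry block first; the palette and
+  // shape fields are stored into the slot by later passes.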
+  MachineBasicBlock &MBB = MF.front();
+  MachineInstr *MI = &*MBB.begin();
+  if (ST.hasAVX512()) {
     Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass);
-    BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm)
+    BuildMI(MBB, MI, DL, TII->get(X86::VPXORDZrr), Zmm)
         .addReg(Zmm, RegState::Undef)
         .addReg(Zmm, RegState::Undef);
-    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)),
-                      FrameIdx)
+    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSZmr)), SS)
         .addReg(Zmm);
-  } else if (ST->hasAVX2()) {
+  } else if (ST.hasAVX2()) {
     Register Ymm = MRI->createVirtualRegister(&X86::VR256RegClass);
-    BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORYrr), Ymm)
+    BuildMI(MBB, MI, DL, TII->get(X86::VPXORYrr), Ymm)
         .addReg(Ymm, RegState::Undef)
         .addReg(Ymm, RegState::Undef);
-    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSYmr)),
-                      FrameIdx)
+    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS)
         .addReg(Ymm);
-    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSYmr)),
-                      FrameIdx, 32)
+    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::VMOVUPSYmr)), SS, 32)
         .addReg(Ymm);
   } else {
-    assert(ST->hasSSE2() && "AMX should assume SSE2 enabled");
+    assert(ST.hasSSE2() && "AMX should assume SSE2 enabled");
     Register Xmm = MRI->createVirtualRegister(&X86::VR128RegClass);
-    BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PXORrr), Xmm)
+    BuildMI(MBB, MI, DL, TII->get(X86::PXORrr), Xmm)
         .addReg(Xmm, RegState::Undef)
         .addReg(Xmm, RegState::Undef);
-    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::MOVUPSmr)),
-                      FrameIdx)
+    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS)
         .addReg(Xmm);
-    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::MOVUPSmr)),
-                      FrameIdx, 16)
+    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 16)
        .addReg(Xmm);
-    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::MOVUPSmr)),
-                      FrameIdx, 32)
+    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 32)
        .addReg(Xmm);
-    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::MOVUPSmr)),
-                      FrameIdx, 48)
+    addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOVUPSmr)), SS, 48)
        .addReg(Xmm);
   }

-  // build psuedo ldtilecfg
-  addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::LDTILECFG)),
-                    FrameIdx);
-}
-
-static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
-  unsigned Opcode = MI.getOpcode();
-  switch (Opcode) {
-  default:
-    llvm_unreachable("Unexpected machine instruction on tile");
-  case X86::PTILELOADDV:
-  case X86::PTDPBSSDV:
-  case X86::PTDPBSUDV:
-  case X86::PTDPBUSDV:
-  case X86::PTDPBUUDV:
-  case X86::PTILEZEROV:
-  case X86::PTDPBF16PSV:
-    MachineOperand &MO1 = const_cast<MachineOperand &>(MI.getOperand(1));
-    MachineOperand &MO2 = const_cast<MachineOperand &>(MI.getOperand(2));
-    ShapeT Shape(&MO1, &MO2, MRI);
-    return Shape;
-  }
-}
-
-MachineInstr *X86PreTileConfig::getTileConfigPoint() {
-  DenseMap<Register, ShapeT> PhysShapeInfo;
-  MachineBasicBlock *MBB = nullptr;
-  DenseSet<const MachineInstr *> MIs;
-  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
-    Register VirtReg = Register::index2VirtReg(i);
-    if (MRI->reg_nodbg_empty(VirtReg))
-      continue;
-    const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
-    if (RC.getID() != X86::TILERegClassID)
-      continue;
-
-    // Find the common dominator for all MI that define tile register.
-    for (const MachineOperand &MO : MRI->def_operands(VirtReg)) {
-      if (MO.isUndef())
-        continue;
-      const auto *MI = MO.getParent();
-      // PHI or IMPLICIT_DEF instructiion.
-      // There must be a input tile before PHI instruction.
-      if (MI->isTransient())
-        continue;
-      if (!MBB)
-        MBB = const_cast<MachineBasicBlock *>(MI->getParent());
-      MBB = DomTree->findNearestCommonDominator(
-          MBB, const_cast<MachineBasicBlock *>(MI->getParent()));
-
-      // Collect the instructions that define shape.
-      ShapeT Shape = getShape(*MI, MRI);
-      std::array<MachineOperand *, 2> ShapeMOs = {Shape.getRow(),
-                                                  Shape.getCol()};
-      for (auto *ShapeMO : ShapeMOs) {
-        Register ShapeReg = ShapeMO->getReg();
-        for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) {
-          const auto *ShapeMI = MO.getParent();
-          MIs.insert(ShapeMI);
-        }
-      }
-    }
-  }
-  if (!MBB)
-    return nullptr;
-  // This pass is before the pass of eliminating PHI node, so it
-  // is in SSA form.
-  assert(MRI->isSSA() && "Not SSA form in pre-tile config");
-  // Shape def should dominate tile config MBB.
-  //    def s        s1    s2
-  //     / \          \    /
-  //    /   \          \  /
-  //  conf              s3=phi(s1,s2)
-  //                     |
-  //                     c
-  //
-  for (const auto *MI : MIs) {
-    const MachineBasicBlock *ShapeMBB = MI->getParent();
-    if (DomTree->dominates(ShapeMBB, MBB))
-      continue;
-    if (MI->isMoveImmediate())
-      continue;
-    report_fatal_error(MF->getName() + ": Failed to config tile register, "
-                                       "please define the shape earlier");
-  }
-
-  // ldtilecfg should be inserted after the MI that define the shape.
-  MachineBasicBlock::reverse_instr_iterator I, E;
-  for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) {
-    auto *MI = &*I;
-    if (MIs.count(MI) && (!MI->isMoveImmediate()))
-      break;
-  }
-  MachineBasicBlock::iterator MII;
-  if (I == E)
-    MII = MBB->getFirstNonPHI();
-  else {
-    MII = MachineBasicBlock::iterator(&*I);
-    MII++;
-  }
-  return &*MII;
-}
-
-static bool isAMXInstruction(MachineBasicBlock::iterator MII) {
-  switch (MII->getOpcode()) {
-  default:
-    return false;
-  case X86::PTILELOADDV:
-  case X86::PTILESTOREDV:
-  case X86::PTDPBSSDV:
-  case X86::PTDPBSUDV:
-  case X86::PTDPBUSDV:
-  case X86::PTDPBUUDV:
-  case X86::PTILEZEROV:
-  case X86::PTDPBF16PSV:
-    return true;
-  }
-}
-
-struct BBInfo {
-  bool HasAMX = false;
-  bool HasCallBeforeAMX = false;
-  bool HasAMXBeforeCallInSuccs = false;
-  MachineInstr *LastCall = nullptr;
-
-  BBInfo() = default;
-  BBInfo(SmallSet<MachineInstr *, 8> &CfgNeedInsert, MachineBasicBlock *MBB,
-         MachineInstr *MI = nullptr) {
-    MachineBasicBlock::iterator MII = MI ? MI->getIterator() : MBB->begin();
-    for (auto E = MBB->end(); MII != E; ++MII) {
-      if (isAMXInstruction(MII)) {
-        HasAMX = true;
-        if (LastCall)
-          CfgNeedInsert.insert(LastCall);
-      } else if (MII->isCall()) {
-        LastCall = &*MII;
-        if (!HasAMX)
-          HasCallBeforeAMX = true;
-      }
-    }
-  }
-};
-
-static void reloadTileConfig(MachineInstr *MI, int FI,
-                             const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI) {
-  SmallSet<MachineInstr *, 8> CfgNeedInsert;
-  SmallVector<MachineBasicBlock *, 8> WorkList;
-  DenseMap<MachineBasicBlock *, BBInfo> BBVisitedInfo;
-
-  MachineBasicBlock *MBB = MI->getParent();
-  BBVisitedInfo[MBB] = BBInfo(CfgNeedInsert, MBB, MI);
-
-  WorkList.push_back(MBB);
-  while (!WorkList.empty()) {
-    MBB = WorkList.pop_back_val();
-    for (auto I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) {
-      if (!BBVisitedInfo.count(*I)) {
-        BBVisitedInfo[*I] = BBInfo(CfgNeedInsert, *I);
-        WorkList.push_back(*I);
-      }
-    }
-  }
-
-  WorkList.clear();
-  for (auto I : BBVisitedInfo) {
-    WorkList.push_back(I.first);
-    while (!WorkList.empty()) {
-      MBB = WorkList.pop_back_val();
-      if (BBVisitedInfo[MBB].HasCallBeforeAMX ||
-          (!BBVisitedInfo[MBB].HasAMX &&
-           !BBVisitedInfo[MBB].HasAMXBeforeCallInSuccs))
-        continue;
-      for (auto I = MBB->pred_begin(), E = MBB->pred_end(); I != E; ++I) {
-        if (!BBVisitedInfo.count(*I) ||
-            BBVisitedInfo[*I].HasAMXBeforeCallInSuccs)
-          continue;
-        if (BBVisitedInfo[*I].LastCall)
-          CfgNeedInsert.insert(BBVisitedInfo[*I].LastCall);
-        BBVisitedInfo[*I].HasAMXBeforeCallInSuccs = true;
-        WorkList.push_back(*I);
-      }
-    }
-  }
-
-  for (auto *I : CfgNeedInsert) {
-    BitVector UsableRegs(TRI->getNumRegs());
-    const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
-    for (unsigned J = 0; J < RC->getNumRegs(); J++)
-      UsableRegs.set(X86::TMM0 + J);
-    for (MachineOperand &CallMO : I->operands()) {
-      if (CallMO.isRegMask())
-        UsableRegs.clearBitsInMask(CallMO.getRegMask());
-    }
-    if (!UsableRegs.none())
-      addFrameReference(BuildMI(*I->getParent(), ++I->getIterator(), DebugLoc(),
-                                TII->get(X86::LDTILECFG)),
-                        FI);
-  }
-}
-
-bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) {
-  MF = &mf;
-  MRI = &mf.getRegInfo();
-  ST = &mf.getSubtarget<X86Subtarget>();
-  TRI = ST->getRegisterInfo();
-  TII = mf.getSubtarget().getInstrInfo();
-  DomTree = &getAnalysis<MachineDominatorTree>();
-
-  MachineInstr *MI = getTileConfigPoint();
-  if (!MI)
-    return false;
-  unsigned Size = ST->getTileConfigSize();
-  Align Alignment = ST->getTileConfigAlignment();
-  int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
-  buildConfigMI(MI, SS, TII, MRI, ST);
-  reloadTileConfig(MI, SS, TII, TRI);
   return true;
 }
diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
--- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
@@ -113,9 +113,10 @@
 ; CHECK-NEXT: pushq %rbx
 ; CHECK-NEXT: subq $3016, %rsp # imm = 0xBC8
 ; CHECK-NEXT: movl %edi, %r14d
-; CHECK-NEXT: callq foo
 ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
 ; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq foo
 ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
@@ -133,7 +134,6 @@
 ; CHECK-NEXT: tileloadd (%r15,%r12), %tmm0
 ; CHECK-NEXT: movabsq $64, %rax
 ; CHECK-NEXT: tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq foo
 ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movabsq $64, %rax
@@ -154,7 +154,6 @@
 ; CHECK-NEXT: incl %r14d
 ; CHECK-NEXT: jmp .LBB2_8
 ; CHECK-NEXT: .LBB2_4:
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq foo
 ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movl $32, %eax
@@ -180,13 +179,13 @@
 ; IPRA: # %bb.0:
 ; IPRA-NEXT: subq $72, %rsp
 ; IPRA-NEXT: movl %edi, %eax
-; IPRA-NEXT: callq foo
 ; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0
 ; IPRA-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
 ; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp)
 ; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
 ; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
 ; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; IPRA-NEXT: callq foo
 ; IPRA-NEXT: testl %edi, %edi
 ; IPRA-NEXT: jg .LBB2_4
 ; IPRA-NEXT: # %bb.1: # %.preheader
@@ -273,12 +272,15 @@
 ; CHECK-NEXT: pushq %rbx
 ; CHECK-NEXT: subq $3024, %rsp # imm = 0xBD0
 ; CHECK-NEXT: movl %edi, %ebx
+; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movl $buf, %r14d
 ; CHECK-NEXT: movl $32, %r15d
 ; CHECK-NEXT: movw $8, %bp
 ; CHECK-NEXT: movl $buf+2048, %r12d
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq foo
 ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
@@ -287,13 +289,9 @@
 ; CHECK-NEXT: testl %ebx, %ebx
 ; CHECK-NEXT: jle .LBB3_3
 ; CHECK-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1
-; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0
 ; CHECK-NEXT: movabsq $64, %rax
 ; CHECK-NEXT: tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill
-; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq foo
 ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movabsq $64, %rax
@@ -314,6 +312,12 @@
 ; IPRA-LABEL: test_loop2:
 ; IPRA: # %bb.0:
 ; IPRA-NEXT: subq $72, %rsp
+; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; IPRA-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
+; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
 ; IPRA-NEXT: movl $buf, %eax
 ; IPRA-NEXT: movl $32, %ecx
 ; IPRA-NEXT: movw $8, %dx
@@ -324,12 +328,6 @@
 ; IPRA-NEXT: testl %edi, %edi
 ; IPRA-NEXT: jle .LBB3_3
 ; IPRA-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1
-; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; IPRA-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
-; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
 ; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0
 ; IPRA-NEXT: callq foo
 ; IPRA-NEXT: tilestored %tmm0, (%rsi,%rcx)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-config.ll b/llvm/test/CodeGen/X86/AMX/amx-config.ll
--- a/llvm/test/CodeGen/X86/AMX/amx-config.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-config.ll
@@ -10,10 +10,10 @@
 define dso_local void @test_api(i32 %0, i16 signext %1, i16 signext %2) {
 ; AVX512-LABEL: test_api:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: testl %edi, %edi
-; AVX512-NEXT: movsbl %sil, %eax
 ; AVX512-NEXT: vpxord %zmm0, %zmm0, %zmm0
 ; AVX512-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: testl %edi, %edi
+; AVX512-NEXT: movsbl %sil, %eax
 ; AVX512-NEXT: movb $1, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
 ; AVX512-NEXT: movw %si, -{{[0-9]+}}(%rsp)
@@ -43,11 +43,11 @@
 ;
 ; AVX2-LABEL: test_api:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: testl %edi, %edi
-; AVX2-NEXT: movsbl %sil, %eax
 ; AVX2-NEXT: vxorps %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
 ; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: testl %edi, %edi
+; AVX2-NEXT: movsbl %sil, %eax
 ; AVX2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
 ; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
 ; AVX2-NEXT: movw %si, -{{[0-9]+}}(%rsp)
@@ -77,13 +77,13 @@
 ;
 ; SSE2-LABEL: test_api:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: testl %edi, %edi
-; SSE2-NEXT: movsbl %sil, %eax
 ; SSE2-NEXT: xorps %xmm0, %xmm0
 ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: testl %edi, %edi
+; SSE2-NEXT: movsbl %sil, %eax
 ; SSE2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT: movw %si, -{{[0-9]+}}(%rsp)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
--- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
@@ -47,6 +47,8 @@
 ; CHECK-NEXT: movl %edi, %ebp
 ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
 ; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq foo
 ; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
@@ -59,9 +61,6 @@
 ; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: testb %al, %al
 ; CHECK-NEXT: jne .LBB1_3
@@ -116,6 +115,60 @@
   ret void
 }

+define dso_local void @test3(i16 signext %0, i16 signext %1) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $3048, %rsp # imm = 0xBE8
+; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edi, %ebp
+; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB2_2
+; CHECK-NEXT: # %bb.1: # %if.true
+; CHECK-NEXT: incl %ebp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: jmp .LBB2_3
+; CHECK-NEXT: .LBB2_2: # %if.false
+; CHECK-NEXT: decl %ebp
+; CHECK-NEXT: .LBB2_3: # %exit
+; CHECK-NEXT: tilezero %tmm0
+; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $buf, %eax
+; CHECK-NEXT: movl $32, %ecx
+; CHECK-NEXT: tilezero %tmm0
+; CHECK-NEXT: tilestored %tmm0, (%rax,%rcx)
+; CHECK-NEXT: addq $3048, %rsp # imm = 0xBE8
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: tilerelease
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+  br i1 undef, label %if.true, label %if.false
+
+if.true:
+  %3 = add i16 %0, 1
+  call void @foo()
+  br label %exit
+
+if.false:
+  %4 = sub i16 %0, 1
+  br label %exit
+
+exit:
+  %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
+  %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1)
+  tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
+  ret void
+}
+
 declare dso_local void @foo() nounwind
 declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -117,12 +117,12 @@
 ; CHECK-NEXT: X86 EFLAGS copy lowering
 ; CHECK-NEXT: X86 WinAlloca Expander
 ; CHECK-NEXT: MachineDominator Tree Construction
+; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: Tile Register Pre-configure
 ; CHECK-NEXT: Detect Dead Lanes
 ; CHECK-NEXT: Process Implicit Definitions
 ; CHECK-NEXT: Remove unreachable machine basic blocks
 ; CHECK-NEXT: Live Variable Analysis
-; CHECK-NEXT: Machine Natural Loop Construction
 ; CHECK-NEXT: Eliminate PHI nodes for register allocation
 ; CHECK-NEXT: Two-Address instruction pass
 ; CHECK-NEXT: Slot index numbering